fix: cleanup; improve code quality

This commit is contained in:
Myzel394 2024-02-21 09:41:29 +01:00
parent 5d076328ec
commit c19371d079
No known key found for this signature in database
GPG Key ID: DEC4AAB876F73185
3 changed files with 77 additions and 57 deletions

View File

@ -1,27 +1,19 @@
// Search engine parser for Brave Search
// This uses the clearnet, unlocalized version of the search engine.
pub mod brave {
use std::sync::Arc;
use futures::lock::Mutex;
use lazy_static::lazy_static;
use regex::Regex;
use tokio::sync::mpsc::Sender;
use urlencoding::decode;
use crate::{
engines::engine_base::engine_base::{
EngineBase, EnginePositions, ResultsCollector, SearchEngine, SearchResult,
},
engines::engine_base::engine_base::{EngineBase, EnginePositions, SearchResult},
helpers::helpers::build_default_client,
utils::utils::decode_html_text,
};
// Regular expressions used while scanning the Brave results page; each one is
// compiled once on first use by `lazy_static!`.
lazy_static! {
// Start marker: the opening of the `<body` tag.
static ref RESULTS_START: Regex = Regex::new(r#"<body"#).unwrap();
// One result snippet, exposing the named groups `url`, `title` and `description`.
static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="snippet svelte-.+?<a href=.(?P<url>.+?)".+?<div class="title svelte-.+?">(?P<title>.+?)</div></div>.+?<div class="snippet-description.+?">(?P<description>.+?)</div></div>"#).unwrap();
// Any run of whitespace.
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
// A single HTML tag; quoted attribute values may themselves contain `>`.
static ref STRIP_HTML_TAGS: Regex = Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap();
}
#[derive(Clone, Debug)]
@ -31,49 +23,13 @@ pub mod brave {
impl EngineBase for Brave {
// Returns the next `SearchResult` found in the buffered page text, or `None` when
// nothing (more) can be extracted yet.
//
// NOTE(review): two bodies appear back to back here — an inline implementation that
// ends in `None`, followed by a delegation to
// `self.positions.handle_block_using_default_method`. This looks like the removed and
// the added side of a change rendered without +/- markers; confirm against the
// repository before treating this span as compilable code.
fn parse_next<'a>(&mut self) -> Option<SearchResult> {
// Inline variant: only look for results once the start marker has been seen.
if self.positions.started {
// The regex runs on a copy of the buffer (`to_owned`), so the buffer itself can be
// sliced further down while `capture` is still in scope.
if let Some(capture) =
SINGLE_RESULT.captures(&self.positions.previous_block.to_owned())
{
// `decode` is `urlencoding::decode`; a decoding error panics via `unwrap`.
let title = decode(capture.name("title").unwrap().as_str())
.unwrap()
.into_owned();
let description_raw =
decode_html_text(capture.name("description").unwrap().as_str()).unwrap();
// Remove any HTML tags from the description.
let description = STRIP_HTML_TAGS
.replace_all(&description_raw, "")
.into_owned();
let url = decode(capture.name("url").unwrap().as_str())
.unwrap()
.into_owned();
let result = SearchResult {
title,
description,
url,
// NOTE(review): tagged as DuckDuckGo although this is the Brave parser — confirm.
engine: SearchEngine::DuckDuckGo,
};
// Group 0 is the whole match; its end offset is handed to `slice_remaining_block`
// so the consumed text is dropped from the buffer.
let end_position = capture.get(0).unwrap().end();
self.positions.slice_remaining_block(&end_position);
return Some(result);
}
}
None
// Delegating variant: the shared helper receives the Brave-specific result regex.
self.positions
.handle_block_using_default_method(&SINGLE_RESULT)
}
// Feeds one packet of response bytes into the parser state.
//
// NOTE(review): as with `parse_next`, an inline body is followed by a delegation to
// `self.positions.handle_start_check_using_default_method`; this looks like the old
// and new side of a change shown together — confirm against the repository.
fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
// Inline variant: decode the bytes lossily as UTF-8 and collapse every whitespace
// run into a single space.
let bytes: Vec<u8> = packet.map(|bit| *bit).collect();
let raw_text = String::from_utf8_lossy(&bytes);
let text = STRIP.replace_all(&raw_text, " ");
if self.positions.started {
self.positions.previous_block.push_str(&text);
} else {
// Before the start marker has been seen the packet is only tested for the marker;
// its text is not appended to the buffer.
self.positions.started = RESULTS_START.is_match(&text);
}
// Delegating variant: the shared helper receives the Brave start marker and the packet.
self.positions
.handle_start_check_using_default_method(&RESULTS_START, packet)
}
}
@ -84,13 +40,13 @@ pub mod brave {
}
}
// Sends `query` to Brave Search and passes the pending request together with `tx`
// to `self.handle_request`.
//
// NOTE(review): two signatures and two final statements are shown (with and without
// the `Result<(), ()>` return); this looks like the old and new side of a change
// rendered together — confirm against the repository.
// NOTE(review): `query` is interpolated into the URL verbatim. Presumably callers
// pass an already percent-encoded string; otherwise characters such as `&` or `#`
// would change the request — confirm.
pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) {
pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> {
let client = build_default_client();
let request = client
.get(format!("https://search.brave.com/search?q={}", query))
.send();
self.handle_request(request, tx).await;
self.handle_request(request, tx).await
}
}
}

View File

@ -6,17 +6,22 @@ pub mod engine_base {
use regex::Regex;
use reqwest::{Error, Response};
use tokio::sync::mpsc::Sender;
use urlencoding::decode;
use crate::utils::utils::decode_html_text;
// Regular expressions shared by the engine helpers; compiled once on first use.
lazy_static! {
// NOTE(review): `STRIP` is defined twice here, which looks like the old and new side
// of a change shown together. Both patterns match runs of whitespace: `\s` already
// includes `\n`, so `[\s\n]+` matches the same text as `\s+`.
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
static ref STRIP: Regex = Regex::new(r"[\s\n]+").unwrap();
// A single HTML tag; quoted attribute values may themselves contain `>`.
static ref STRIP_HTML_TAGS: Regex =
Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap();
}
// Identifies the search engine a `SearchResult` is attributed to (stored in its
// `engine` field).
// NOTE(review): two `#[derive]` lines are shown; the second adds the equality and
// ordering traits. This looks like the old and new side of a change rendered
// together — confirm against the repository.
#[derive(Clone, Copy, Debug, Hash)]
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum SearchEngine {
// The only variant visible in this excerpt.
DuckDuckGo,
}
#[derive(Clone, Debug, Hash)]
#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct SearchResult {
pub title: String,
pub url: String,
@ -52,7 +57,7 @@ pub mod engine_base {
&mut self,
request: impl Future<Output = Result<Response, Error>>,
tx: Sender<SearchResult>,
) {
) -> Result<(), ()> {
let mut stream = request.await.unwrap().bytes_stream();
while let Some(chunk) = stream.next().await {
@ -61,14 +66,20 @@ pub mod engine_base {
self.push_packet(buffer.iter());
while let Some(result) = self.parse_next() {
tx.send(result).await;
if tx.send(result).await.is_err() {
return Err(());
}
}
}
while let Some(result) = self.parse_next() {
tx.send(result).await;
if tx.send(result).await.is_err() {
return Err(());
}
}
Ok(())
}
}
impl ResultsCollector {
@ -132,5 +143,57 @@ pub mod engine_base {
self.previous_block.clear();
self.previous_block.push_str(&remaining_text);
}
/// Feeds one raw packet of response bytes into the scanner state.
///
/// The bytes are decoded lossily as UTF-8 and every run of whitespace is collapsed
/// into a single space. Once scanning has started, the text is appended to
/// `previous_block`. Before that, the packet is searched for `results_start_regex`;
/// when the marker is found, scanning starts and the text from the marker onwards
/// is buffered, so results that arrive in the same packet as the marker are kept.
///
/// Known limitations: a start marker that is split across two packets is not
/// detected, and a multi-byte character split across two packets is replaced by
/// U+FFFD because each packet is decoded on its own.
pub fn handle_start_check_using_default_method<'a>(
    &mut self,
    results_start_regex: &Regex,
    packet: impl Iterator<Item = &'a u8>,
) {
    let bytes: Vec<u8> = packet.copied().collect();
    let raw_text = String::from_utf8_lossy(&bytes);
    let text = STRIP.replace_all(&raw_text, " ");

    if self.started {
        self.previous_block.push_str(&text);
    } else if let Some(marker) = results_start_regex.find(&text) {
        self.started = true;
        // Keep everything from the marker on; the packet that carries the marker
        // usually carries the first results as well.
        self.previous_block.push_str(&text[marker.start()..]);
    }
}
/// Extracts the next search result from the buffered text, if one is complete.
///
/// `single_result_regex` must define the named groups `title`, `description` and
/// `url`. On a match the buffer is advanced past the matched text via
/// `slice_remaining_block`, so the following call continues after it.
///
/// Returns `None` when scanning has not started yet or when the buffer holds no
/// complete result.
///
/// # Panics
///
/// Panics if `single_result_regex` lacks one of the required named groups (or the
/// group did not take part in the match), or if `decode_html_text` reports a
/// failure for the description.
pub fn handle_block_using_default_method(
    &mut self,
    single_result_regex: &Regex,
) -> Option<SearchResult> {
    if !self.started {
        return None;
    }

    // The capture borrows the buffer, so everything needed from it is turned into
    // owned values inside this scope; the buffer can then be sliced without
    // cloning it first.
    let (result, end_position) = {
        let capture = single_result_regex.captures(&self.previous_block)?;

        let title_raw = capture
            .name("title")
            .expect("result regex must define a `title` group")
            .as_str();
        let description_match = capture
            .name("description")
            .expect("result regex must define a `description` group")
            .as_str();
        let url_raw = capture
            .name("url")
            .expect("result regex must define a `url` group")
            .as_str();

        // The page content is untrusted: a percent-sequence that does not decode
        // to valid UTF-8 must not take the parser down, so fall back to the raw text.
        let title = decode(title_raw)
            .map_or_else(|_| title_raw.to_owned(), |text| text.into_owned());
        let url = decode(url_raw).map_or_else(|_| url_raw.to_owned(), |text| text.into_owned());

        let description_raw = decode_html_text(description_match).unwrap();
        let description = STRIP_HTML_TAGS
            .replace_all(&description_raw, "")
            .into_owned();

        let result = SearchResult {
            title,
            description,
            url,
            // NOTE(review): every result is tagged `DuckDuckGo`, regardless of which
            // engine's regex was passed in — confirm whether the engine should be
            // supplied by the caller.
            engine: SearchEngine::DuckDuckGo,
        };
        let end = capture
            .get(0)
            .expect("group 0 is the whole match and always present")
            .end();

        (result, end)
    };

    self.slice_remaining_block(&end_position);
    Some(result)
}
}
}

View File

@ -2,6 +2,7 @@ use std::sync::Arc;
use std::{str, thread};
use engines::brave::brave::Brave;
use engines::duckduckgo::duckduckgo::DuckDuckGo;
use engines::engine_base::engine_base::{ResultsCollector, SearchResult};
use futures::lock::Mutex;
use lazy_static::lazy_static;