fix: cleanup; improve code quality

This commit is contained in:
Myzel394 2024-02-21 09:41:29 +01:00
parent 5d076328ec
commit c19371d079
No known key found for this signature in database
GPG Key ID: DEC4AAB876F73185
3 changed files with 77 additions and 57 deletions

View File

@ -1,27 +1,19 @@
// Search engine parser for Brave Search // Search engine parser for Brave Search
// This uses the clearnet, unlocalized version of the search engine. // This uses the clearnet, unlocalized version of the search engine.
pub mod brave { pub mod brave {
use std::sync::Arc;
use futures::lock::Mutex;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use regex::Regex; use regex::Regex;
use tokio::sync::mpsc::Sender; use tokio::sync::mpsc::Sender;
use urlencoding::decode;
use crate::{ use crate::{
engines::engine_base::engine_base::{ engines::engine_base::engine_base::{EngineBase, EnginePositions, SearchResult},
EngineBase, EnginePositions, ResultsCollector, SearchEngine, SearchResult,
},
helpers::helpers::build_default_client, helpers::helpers::build_default_client,
utils::utils::decode_html_text,
}; };
lazy_static! { lazy_static! {
static ref RESULTS_START: Regex = Regex::new(r#"<body"#).unwrap(); static ref RESULTS_START: Regex = Regex::new(r#"<body"#).unwrap();
static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="snippet svelte-.+?<a href=.(?P<url>.+?)".+?<div class="title svelte-.+?">(?P<title>.+?)</div></div>.+?<div class="snippet-description.+?">(?P<description>.+?)</div></div>"#).unwrap(); static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="snippet svelte-.+?<a href=.(?P<url>.+?)".+?<div class="title svelte-.+?">(?P<title>.+?)</div></div>.+?<div class="snippet-description.+?">(?P<description>.+?)</div></div>"#).unwrap();
static ref STRIP: Regex = Regex::new(r"\s+").unwrap(); static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
static ref STRIP_HTML_TAGS: Regex = Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap();
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
@ -31,49 +23,13 @@ pub mod brave {
impl EngineBase for Brave { impl EngineBase for Brave {
fn parse_next<'a>(&mut self) -> Option<SearchResult> { fn parse_next<'a>(&mut self) -> Option<SearchResult> {
if self.positions.started { self.positions
if let Some(capture) = .handle_block_using_default_method(&SINGLE_RESULT)
SINGLE_RESULT.captures(&self.positions.previous_block.to_owned())
{
let title = decode(capture.name("title").unwrap().as_str())
.unwrap()
.into_owned();
let description_raw =
decode_html_text(capture.name("description").unwrap().as_str()).unwrap();
let description = STRIP_HTML_TAGS
.replace_all(&description_raw, "")
.into_owned();
let url = decode(capture.name("url").unwrap().as_str())
.unwrap()
.into_owned();
let result = SearchResult {
title,
description,
url,
engine: SearchEngine::DuckDuckGo,
};
let end_position = capture.get(0).unwrap().end();
self.positions.slice_remaining_block(&end_position);
return Some(result);
}
}
None
} }
fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) { fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
let bytes: Vec<u8> = packet.map(|bit| *bit).collect(); self.positions
let raw_text = String::from_utf8_lossy(&bytes); .handle_start_check_using_default_method(&RESULTS_START, packet)
let text = STRIP.replace_all(&raw_text, " ");
if self.positions.started {
self.positions.previous_block.push_str(&text);
} else {
self.positions.started = RESULTS_START.is_match(&text);
}
} }
} }
@ -84,13 +40,13 @@ pub mod brave {
} }
} }
pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) { pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> {
let client = build_default_client(); let client = build_default_client();
let request = client let request = client
.get(format!("https://search.brave.com/search?q={}", query)) .get(format!("https://search.brave.com/search?q={}", query))
.send(); .send();
self.handle_request(request, tx).await; self.handle_request(request, tx).await
} }
} }
} }

View File

@ -6,17 +6,22 @@ pub mod engine_base {
use regex::Regex; use regex::Regex;
use reqwest::{Error, Response}; use reqwest::{Error, Response};
use tokio::sync::mpsc::Sender; use tokio::sync::mpsc::Sender;
use urlencoding::decode;
use crate::utils::utils::decode_html_text;
lazy_static! { lazy_static! {
static ref STRIP: Regex = Regex::new(r"\s+").unwrap(); static ref STRIP: Regex = Regex::new(r"[\s\n]+").unwrap();
static ref STRIP_HTML_TAGS: Regex =
Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap();
} }
#[derive(Clone, Copy, Debug, Hash)] #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum SearchEngine { pub enum SearchEngine {
DuckDuckGo, DuckDuckGo,
} }
#[derive(Clone, Debug, Hash)] #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct SearchResult { pub struct SearchResult {
pub title: String, pub title: String,
pub url: String, pub url: String,
@ -52,7 +57,7 @@ pub mod engine_base {
&mut self, &mut self,
request: impl Future<Output = Result<Response, Error>>, request: impl Future<Output = Result<Response, Error>>,
tx: Sender<SearchResult>, tx: Sender<SearchResult>,
) { ) -> Result<(), ()> {
let mut stream = request.await.unwrap().bytes_stream(); let mut stream = request.await.unwrap().bytes_stream();
while let Some(chunk) = stream.next().await { while let Some(chunk) = stream.next().await {
@ -61,13 +66,19 @@ pub mod engine_base {
self.push_packet(buffer.iter()); self.push_packet(buffer.iter());
while let Some(result) = self.parse_next() { while let Some(result) = self.parse_next() {
tx.send(result).await; if tx.send(result).await.is_err() {
return Err(());
}
} }
} }
while let Some(result) = self.parse_next() { while let Some(result) = self.parse_next() {
tx.send(result).await; if tx.send(result).await.is_err() {
return Err(());
}
} }
Ok(())
} }
} }
@ -132,5 +143,57 @@ pub mod engine_base {
self.previous_block.clear(); self.previous_block.clear();
self.previous_block.push_str(&remaining_text); self.previous_block.push_str(&remaining_text);
} }
/// Feeds one network packet into the parser state.
///
/// The packet's bytes are decoded as UTF-8 (invalid sequences are replaced
/// with U+FFFD) and every whitespace run matched by `STRIP` is collapsed to
/// a single space.
///
/// * Once `started` is set, the cleaned text is appended to `previous_block`.
/// * Before that, the text is searched for `results_start_regex`. When the
///   marker is found, `started` is set and the text from the marker onwards
///   is buffered, so results that arrive in the same packet as the marker
///   are not lost.
///
/// Each packet is decoded on its own, so a multi-byte character or a start
/// marker that is split across two packets is not recognised.
pub fn handle_start_check_using_default_method<'a>(
    &mut self,
    results_start_regex: &Regex,
    packet: impl Iterator<Item = &'a u8>,
) {
    let bytes: Vec<u8> = packet.copied().collect();
    let raw_text = String::from_utf8_lossy(&bytes);
    let text = STRIP.replace_all(&raw_text, " ");

    if self.started {
        self.previous_block.push_str(&text);
    } else if let Some(marker) = results_start_regex.find(&text) {
        self.started = true;
        // Keep the remainder of this packet: it may already contain results.
        self.previous_block.push_str(&text[marker.start()..]);
    }
}
/// Extracts the next search result from the buffered text, if one is available.
///
/// Returns `None` while `started` is unset or when `single_result_regex`
/// does not match `previous_block`. On a match, a [`SearchResult`] is built
/// from the named groups `title`, `description` and `url`, and the end offset
/// of the match is passed to `slice_remaining_block` so the consumed text is
/// not matched again.
///
/// `title` and `url` are percent-decoded; when decoding fails the raw
/// captured text is used instead of panicking on remote content. The
/// description goes through `decode_html_text` and then has HTML tags removed
/// via `STRIP_HTML_TAGS`.
///
/// # Panics
///
/// Panics if `single_result_regex` lacks one of the three named groups, or if
/// `decode_html_text` fails for the captured description.
pub fn handle_block_using_default_method(
    &mut self,
    single_result_regex: &Regex,
) -> Option<SearchResult> {
    /// Percent-decodes `raw`, falling back to the unchanged input on failure.
    fn percent_decode_or_raw(raw: &str) -> String {
        decode(raw)
            .map(|decoded| decoded.into_owned())
            .unwrap_or_else(|_| raw.to_owned())
    }

    if !self.started {
        return None;
    }

    // Scope the captures so their borrow of `previous_block` ends before the
    // buffer is mutated below; this avoids cloning the whole buffer per call.
    let (result, end_position) = {
        let capture = single_result_regex.captures(&self.previous_block)?;

        let raw_title = capture
            .name("title")
            .expect("single_result_regex must define a `title` group")
            .as_str();
        let raw_description = capture
            .name("description")
            .expect("single_result_regex must define a `description` group")
            .as_str();
        let raw_url = capture
            .name("url")
            .expect("single_result_regex must define a `url` group")
            .as_str();

        let title = percent_decode_or_raw(raw_title);
        let url = percent_decode_or_raw(raw_url);

        // NOTE(review): `decode_html_text` is defined elsewhere; its failure
        // mode is not visible here, so the original `unwrap` is kept.
        let description_raw = decode_html_text(raw_description).unwrap();
        let description = STRIP_HTML_TAGS
            .replace_all(&description_raw, "")
            .into_owned();

        let result = SearchResult {
            title,
            description,
            url,
            // NOTE(review): the engine is hard-coded to DuckDuckGo although
            // the Brave parser also calls this method — confirm intended.
            engine: SearchEngine::DuckDuckGo,
        };

        // Group 0 is the whole match and is always present.
        let end_position = capture.get(0).unwrap().end();

        (result, end_position)
    };

    self.slice_remaining_block(&end_position);

    Some(result)
}
} }
} }

View File

@ -2,6 +2,7 @@ use std::sync::Arc;
use std::{str, thread}; use std::{str, thread};
use engines::brave::brave::Brave; use engines::brave::brave::Brave;
use engines::duckduckgo::duckduckgo::DuckDuckGo;
use engines::engine_base::engine_base::{ResultsCollector, SearchResult}; use engines::engine_base::engine_base::{ResultsCollector, SearchResult};
use futures::lock::Mutex; use futures::lock::Mutex;
use lazy_static::lazy_static; use lazy_static::lazy_static;