diff --git a/src/engines/brave.rs b/src/engines/brave.rs index f33b254..cb594eb 100644 --- a/src/engines/brave.rs +++ b/src/engines/brave.rs @@ -1,27 +1,19 @@ // Search engine parser for Brave Search // This uses the clearnet, unlocalized version of the search engine. pub mod brave { - use std::sync::Arc; - - use futures::lock::Mutex; use lazy_static::lazy_static; use regex::Regex; use tokio::sync::mpsc::Sender; - use urlencoding::decode; use crate::{ - engines::engine_base::engine_base::{ - EngineBase, EnginePositions, ResultsCollector, SearchEngine, SearchResult, - }, + engines::engine_base::engine_base::{EngineBase, EnginePositions, SearchResult}, helpers::helpers::build_default_client, - utils::utils::decode_html_text, }; lazy_static! { static ref RESULTS_START: Regex = Regex::new(r#"(?P.+?)</div></div>.+?<div class="snippet-description.+?">(?P<description>.+?)</div></div>"#).unwrap(); static ref STRIP: Regex = Regex::new(r"\s+").unwrap(); - static ref STRIP_HTML_TAGS: Regex = Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap(); } #[derive(Clone, Debug)] @@ -31,49 +23,13 @@ pub mod brave { impl EngineBase for Brave { fn parse_next<'a>(&mut self) -> Option<SearchResult> { - if self.positions.started { - if let Some(capture) = - SINGLE_RESULT.captures(&self.positions.previous_block.to_owned()) - { - let title = decode(capture.name("title").unwrap().as_str()) - .unwrap() - .into_owned(); - let description_raw = - decode_html_text(capture.name("description").unwrap().as_str()).unwrap(); - let description = STRIP_HTML_TAGS - .replace_all(&description_raw, "") - .into_owned(); - let url = decode(capture.name("url").unwrap().as_str()) - .unwrap() - .into_owned(); - - let result = SearchResult { - title, - description, - url, - engine: SearchEngine::DuckDuckGo, - }; - - let end_position = capture.get(0).unwrap().end(); - self.positions.slice_remaining_block(&end_position); - - return Some(result); - } - } - - None + self.positions + .handle_block_using_default_method(&SINGLE_RESULT) } fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) { - let bytes: Vec<u8> = packet.map(|bit| *bit).collect(); - let raw_text = String::from_utf8_lossy(&bytes); - let text = STRIP.replace_all(&raw_text, " "); - - if self.positions.started { - self.positions.previous_block.push_str(&text); - } else { - self.positions.started = RESULTS_START.is_match(&text); - } + self.positions + .handle_start_check_using_default_method(&RESULTS_START, packet) } } @@ -84,13 +40,13 @@ pub mod brave { } } - pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) { + pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> { let client = build_default_client(); let request = client .get(format!("https://search.brave.com/search?q={}", query)) .send(); - self.handle_request(request, tx).await; + self.handle_request(request, tx).await } } } diff --git a/src/engines/engine_base.rs b/src/engines/engine_base.rs index 23e1eaa..792efc0 100644 --- a/src/engines/engine_base.rs +++ b/src/engines/engine_base.rs @@ -6,17 +6,22 @@ pub mod engine_base { use regex::Regex; use reqwest::{Error, Response}; use tokio::sync::mpsc::Sender; + use urlencoding::decode; + + use crate::utils::utils::decode_html_text; lazy_static! { - static ref STRIP: Regex = Regex::new(r"\s+").unwrap(); + static ref STRIP: Regex = Regex::new(r"[\s\n]+").unwrap(); + static ref STRIP_HTML_TAGS: Regex = + Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap(); } - #[derive(Clone, Copy, Debug, Hash)] + #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] pub enum SearchEngine { DuckDuckGo, } - #[derive(Clone, Debug, Hash)] + #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct SearchResult { pub title: String, pub url: String, @@ -52,7 +57,7 @@ pub mod engine_base { &mut self, request: impl Future<Output = Result<Response, Error>>, tx: Sender<SearchResult>, - ) { + ) -> Result<(), ()> { let mut stream = request.await.unwrap().bytes_stream(); while let Some(chunk) = stream.next().await { @@ -61,13 +66,19 @@ pub mod engine_base { self.push_packet(buffer.iter()); while let Some(result) = self.parse_next() { - tx.send(result).await; + if tx.send(result).await.is_err() { + return Err(()); + } } } while let Some(result) = self.parse_next() { - tx.send(result).await; + if tx.send(result).await.is_err() { + return Err(()); + } } + + Ok(()) } } @@ -132,5 +143,57 @@ pub mod engine_base { self.previous_block.clear(); self.previous_block.push_str(&remaining_text); } + + pub fn handle_start_check_using_default_method<'a>( + &mut self, + results_start_regex: &Regex, + packet: impl Iterator<Item = &'a u8>, + ) { + let bytes: Vec<u8> = packet.map(|bit| *bit).collect(); + let raw_text = String::from_utf8_lossy(&bytes); + let text = STRIP.replace_all(&raw_text, " "); + + if self.started { + self.previous_block.push_str(&text); + } else { + self.started = results_start_regex.is_match(&text); + } + } + + pub fn handle_block_using_default_method( + &mut self, + single_result_regex: &Regex, + ) -> Option<SearchResult> { + if self.started { + if let Some(capture) = single_result_regex.captures(&self.previous_block.to_owned()) + { + let title = decode(capture.name("title").unwrap().as_str()) + .unwrap() + .into_owned(); + let description_raw = + decode_html_text(capture.name("description").unwrap().as_str()).unwrap(); + let description = STRIP_HTML_TAGS + .replace_all(&description_raw, "") + .into_owned(); + let url = decode(capture.name("url").unwrap().as_str()) + .unwrap() + .into_owned(); + + let result = SearchResult { + title, + description, + url, + engine: SearchEngine::DuckDuckGo, + }; + + let end_position = capture.get(0).unwrap().end(); + self.slice_remaining_block(&end_position); + + return Some(result); + } + } + + None + } } } diff --git a/src/main.rs b/src/main.rs index 97d283d..42e812b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ use std::sync::Arc; use std::{str, thread}; use engines::brave::brave::Brave; +use engines::duckduckgo::duckduckgo::DuckDuckGo; use engines::engine_base::engine_base::{ResultsCollector, SearchResult}; use futures::lock::Mutex; use lazy_static::lazy_static;