From a230e8b9db9abc2153d6efad93c2116214fc547c Mon Sep 17 00:00:00 2001 From: Myzel394 <50424412+Myzel394@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:07:19 +0100 Subject: [PATCH] feat: Improve bing; Add concurrent search engine --- src/engines.rs | 1 + src/engines/bing.rs | 53 +++++++++++++++ src/engines/brave.rs | 7 +- src/engines/duckduckgo.rs | 128 ++++++------------------------------- src/engines/engine_base.rs | 65 +++++-------------- src/helpers.rs | 8 +-- src/main.rs | 28 ++++++-- 7 files changed, 117 insertions(+), 173 deletions(-) create mode 100644 src/engines/bing.rs diff --git a/src/engines.rs b/src/engines.rs index 9311cee..cb584d1 100644 --- a/src/engines.rs +++ b/src/engines.rs @@ -1,3 +1,4 @@ +pub mod bing; pub mod brave; pub mod duckduckgo; pub mod engine_base; diff --git a/src/engines/bing.rs b/src/engines/bing.rs new file mode 100644 index 0000000..7f73f23 --- /dev/null +++ b/src/engines/bing.rs @@ -0,0 +1,53 @@ +// Search engine parser for Bing Search +// This uses the clearnet, unlocalized version of the search engine. +pub mod bing { + use lazy_static::lazy_static; + use regex::Regex; + use tokio::sync::mpsc::Sender; + + use crate::{ + engines::engine_base::engine_base::{ + EngineBase, EnginePositions, SearchEngine, SearchResult, + }, + helpers::helpers::build_default_client, + }; + + lazy_static! { + static ref RESULTS_START: Regex = Regex::new(r#"id="b_results""#).unwrap(); + static ref SINGLE_RESULT: Regex = Regex::new(r#"
  • (?P.+?)</a></h2>.*?((<div class="b_caption.*?<p.*?)|(<p class="b_lineclamp3.*?))><span.*?</span>(?P<description>.*?)</p>.*?</li>"#).unwrap(); + } + + #[derive(Clone, Debug)] + pub struct Bing { + positions: EnginePositions, + } + + impl EngineBase for Bing { + fn parse_next<'a>(&mut self) -> Option<SearchResult> { + self.positions + .handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::Bing) + } + + fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) { + self.positions + .handle_start_check_using_default_method(&RESULTS_START, packet) + } + } + + impl Bing { + pub fn new() -> Self { + Self { + positions: EnginePositions::new(), + } + } + + pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> { + let client = build_default_client(); + let request = client + .get(format!("https://www.bing.com/search?q={}", query)) + .send(); + + self.handle_request(request, tx).await + } + } +} diff --git a/src/engines/brave.rs b/src/engines/brave.rs index cb594eb..ac72a35 100644 --- a/src/engines/brave.rs +++ b/src/engines/brave.rs @@ -6,14 +6,15 @@ pub mod brave { use tokio::sync::mpsc::Sender; use crate::{ - engines::engine_base::engine_base::{EngineBase, EnginePositions, SearchResult}, + engines::engine_base::engine_base::{ + EngineBase, EnginePositions, SearchEngine, SearchResult, + }, helpers::helpers::build_default_client, }; lazy_static! 
{ static ref RESULTS_START: Regex = Regex::new(r#"<body"#).unwrap(); static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="snippet svelte-.+?<a href=.(?P<url>.+?)".+?<div class="title svelte-.+?">(?P<title>.+?)</div></div>.+?<div class="snippet-description.+?">(?P<description>.+?)</div></div>"#).unwrap(); - static ref STRIP: Regex = Regex::new(r"\s+").unwrap(); } #[derive(Clone, Debug)] @@ -24,7 +25,7 @@ pub mod brave { impl EngineBase for Brave { fn parse_next<'a>(&mut self) -> Option<SearchResult> { self.positions - .handle_block_using_default_method(&SINGLE_RESULT) + .handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::Brave) } fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) { diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs index fbe8866..b0efac4 100644 --- a/src/engines/duckduckgo.rs +++ b/src/engines/duckduckgo.rs @@ -1,141 +1,53 @@ -// Search engine parser for DuckDuckGo +// Search engine parser for DuckDuckGo Search pub mod duckduckgo { use lazy_static::lazy_static; use regex::Regex; - use urlencoding::decode; + use tokio::sync::mpsc::Sender; use crate::{ - engines::engine_base::engine_base::{EngineBase, SearchEngine, SearchResult}, - utils::utils::decode_html_text, + engines::engine_base::engine_base::{ + EngineBase, EnginePositions, SearchEngine, SearchResult, + }, + helpers::helpers::build_default_client, }; lazy_static! 
{ static ref RESULTS_START: Regex = Regex::new(r#"id=\"links\""#).unwrap(); static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="result results_links.*?<a.*?href="(?P<url>.*?)".*?>(?P<title>.*?)</a>.*?class="result__snippet".*?>(?P<description>.*?)</a>.*?class="clear".*?</div>(?P<end> </div>){2}"#).unwrap(); - static ref STRIP: Regex = Regex::new(r"\s+").unwrap(); - static ref STRIP_HTML_TAGS: Regex = Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap(); } - pub type CallbackType = Box<dyn FnMut(SearchResult) -> () + Send + Sync>; + const URL: &str = "https://html.duckduckgo.com/html"; + #[derive(Clone, Debug)] pub struct DuckDuckGo { - callback: CallbackType, - pub completed: bool, - results_started: bool, - pub previous_block: String, - // Holds all results until consumed by iterator - pub results: Vec<SearchResult>, + positions: EnginePositions, } - // impl Stream for DuckDuckGo { - // type Item = String; - // - // fn poll_next( - // self: Pin<&mut Self>, - // cx: &mut Context<'_>, - // ) -> std::task::Poll<Option<Self::Item>> { - // if self.results.len() > 0 { - // let result = &mut self.results.pop_front().unwrap(); - // - // let html = format!("<br><h2>{}</h2><p>{}</p>", result.title, result.description); - // - // return Poll::Ready(Some(html)); - // } - // - // if self.completed { - // return Poll::Ready(None); - // } - // - // Poll::Pending - // } - // } - - // impl Iterator for DuckDuckGo { - // type Item = SearchResult; - // - // fn next(&mut self) -> Option<SearchResult> { - // if self.results.len() > 0 { - // let oldest = self.results.pop_front().unwrap(); - // - // Some(oldest) - // } else { - // None - // } - // } - // } - impl EngineBase for DuckDuckGo { fn parse_next<'a>(&mut self) -> Option<SearchResult> { - if self.results_started { - match SINGLE_RESULT.captures(&self.previous_block.to_owned()) { - Some(captures) => { - let title = decode(captures.name("title").unwrap().as_str()) - .unwrap() - .into_owned(); - let 
description_raw = - decode_html_text(captures.name("description").unwrap().as_str()) - .unwrap(); - let description = STRIP_HTML_TAGS - .replace_all(&description_raw, "") - .into_owned(); - let url = decode(captures.name("url").unwrap().as_str()) - .unwrap() - .into_owned(); - - let result = SearchResult { - title, - description, - url, - engine: SearchEngine::DuckDuckGo, - }; - - let end_position = captures.name("end").unwrap().end(); - self.slice_remaining_block(&end_position); - - return Some(result); - } - None => {} - } - } - - None + self.positions + .handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::DuckDuckGo) } fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) { - let bytes: Vec<u8> = packet.map(|bit| *bit).collect(); - let raw_text = String::from_utf8_lossy(&bytes); - let text = STRIP.replace_all(&raw_text, " "); - - if self.results_started { - self.previous_block.push_str(&text); - } else { - self.results_started = RESULTS_START.is_match(&text); - } + self.positions + .handle_start_check_using_default_method(&RESULTS_START, packet) } } impl DuckDuckGo { - fn slice_remaining_block(&mut self, start_position: &usize) { - let previous_block_bytes = self.previous_block.as_bytes().to_vec(); - let remaining_bytes = previous_block_bytes[*start_position..].to_vec(); - let remaining_text = String::from_utf8(remaining_bytes).unwrap(); - - self.previous_block.clear(); - self.previous_block.push_str(&remaining_text); - } - pub fn new() -> Self { Self { - callback: Box::new(|_: SearchResult| {}), - results_started: false, - previous_block: String::new(), - results: vec![], - completed: false, + positions: EnginePositions::new(), } } - pub fn set_callback(&mut self, callback: CallbackType) { - self.callback = callback; + pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> { + let client = build_default_client(); + let params = [("q", query)]; + let request = client.post(URL).form(¶ms).send(); + + 
self.handle_request(request, tx).await } } } diff --git a/src/engines/engine_base.rs b/src/engines/engine_base.rs index 792efc0..133729f 100644 --- a/src/engines/engine_base.rs +++ b/src/engines/engine_base.rs @@ -1,5 +1,5 @@ pub mod engine_base { - use std::sync::Arc; + use std::{fmt::Display, sync::Arc}; use futures::{lock::Mutex, Future, StreamExt}; use lazy_static::lazy_static; @@ -18,9 +18,21 @@ pub mod engine_base { #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] pub enum SearchEngine { + Brave, + Bing, DuckDuckGo, } + impl Display for SearchEngine { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SearchEngine::Brave => write!(f, "Brave"), + SearchEngine::DuckDuckGo => write!(f, "DuckDuckGo"), + SearchEngine::Bing => write!(f, "Bing"), + } + } + } + #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct SearchResult { pub title: String, @@ -29,15 +41,6 @@ pub mod engine_base { pub engine: SearchEngine, } - /// ResultsCollector collects results across multiple tasks - #[derive(Clone, Debug, Hash, Default)] - pub struct ResultsCollector { - pub started: bool, - pub previous_block: String, - results: Vec<SearchResult>, - current_index: usize, - } - pub trait EngineBase { fn parse_next<'a>(&mut self) -> Option<SearchResult>; @@ -82,45 +85,6 @@ pub mod engine_base { } } - impl ResultsCollector { - pub fn new() -> Self { - Self { - results: Vec::new(), - current_index: 0, - previous_block: String::new(), - started: false, - } - } - - pub fn results(&self) -> &Vec<SearchResult> { - &self.results - } - - pub fn add_result(&mut self, result: SearchResult) { - self.results.push(result); - } - - pub fn get_next_items(&self) -> &[SearchResult] { - if self.current_index >= self.results.len() { - return &[]; - } - - &self.results[self.current_index + 1..self.results.len()] - } - - pub fn update_index(&mut self) { - self.current_index = self.results.len() - 1; - } - - pub fn 
has_more_results(&self) -> bool { - if self.results.len() == 0 { - return true; - } - - self.current_index < self.results.len() - 1 - } - } - #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct EnginePositions { pub previous_block: String, @@ -163,6 +127,7 @@ pub mod engine_base { pub fn handle_block_using_default_method( &mut self, single_result_regex: &Regex, + engine: SearchEngine, ) -> Option<SearchResult> { if self.started { if let Some(capture) = single_result_regex.captures(&self.previous_block.to_owned()) @@ -183,7 +148,7 @@ pub mod engine_base { title, description, url, - engine: SearchEngine::DuckDuckGo, + engine, }; let end_position = capture.get(0).unwrap().end(); diff --git a/src/helpers.rs b/src/helpers.rs index 65572e2..2a708cd 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -2,13 +2,7 @@ // This module differs from utils in the way that everything here // is specifically related the project pub mod helpers { - use std::sync::Arc; - - use bytes::Bytes; - use futures::{lock::Mutex, Future, Stream, StreamExt}; - use reqwest::{Client, ClientBuilder, Error, Response}; - - use crate::engines::engine_base::engine_base::{EngineBase, ResultsCollector}; + use reqwest::{Client, ClientBuilder}; const DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.3"; diff --git a/src/main.rs b/src/main.rs index 42e812b..f2a6fe1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,10 @@ +use std::str; use std::sync::Arc; -use std::{str, thread}; +use engines::bing::bing::Bing; use engines::brave::brave::Brave; use engines::duckduckgo::duckduckgo::DuckDuckGo; -use engines::engine_base::engine_base::{ResultsCollector, SearchResult}; +use engines::engine_base::engine_base::SearchResult; use futures::lock::Mutex; use lazy_static::lazy_static; use rocket::response::content::{RawCss, RawHtml}; @@ -50,17 +51,34 @@ fn get_tailwindcss() -> RawCss<&'static str> { 
#[get("/searchquery?<query>")] async fn hello<'a>(query: &str) -> RawHtml<TextStream![String]> { - let query_box = query.to_string(); + let query_brave = query.to_owned().clone(); + let query_duckduckgo = query.to_owned().clone(); + let query_bing = query.to_owned().clone(); let mut first_result_yielded = false; let first_result_start = Instant::now(); let (tx, mut rx) = mpsc::channel::<SearchResult>(16); + let tx_brave = tx.clone(); + let tx_duckduckgo = tx.clone(); + let tx_bing = tx.clone(); tokio::spawn(async move { let mut brave = Brave::new(); - brave.search(&query_box, tx).await; + brave.search(&query_brave, tx_brave).await; + }); + + tokio::spawn(async move { + let mut duckduckgo = DuckDuckGo::new(); + + duckduckgo.search(&query_duckduckgo, tx_duckduckgo).await; + }); + + tokio::spawn(async move { + let mut bing = Bing::new(); + + bing.search(&query_bing, tx_bing).await; }); RawHtml(TextStream! { @@ -74,7 +92,7 @@ async fn hello<'a>(query: &str) -> RawHtml<TextStream![String]> { yield format!("<strong>Time taken: {}ms</strong>", diff); } - let text = format!("<li><h1>{}</h1><p>{}</p></li>", &result.title, &result.description); + let text = format!("<li><h1>{}</h1><p>{}</p><i>{}</i></li>", &result.title, &result.description, &result.engine.to_string()); yield text.to_string(); }