feat: Improve bing; Add concurrent search engine

This commit is contained in:
Myzel394 2024-02-21 11:07:19 +01:00
parent c19371d079
commit a230e8b9db
No known key found for this signature in database
GPG Key ID: DEC4AAB876F73185
7 changed files with 117 additions and 173 deletions

View File

@ -1,3 +1,4 @@
pub mod bing;
pub mod brave;
pub mod duckduckgo;
pub mod engine_base;

53
src/engines/bing.rs Normal file
View File

@ -0,0 +1,53 @@
// Search engine parser for Bing Search
// This uses the clearnet, unlocalized version of the search engine.
pub mod bing {
    use lazy_static::lazy_static;
    use regex::Regex;
    use tokio::sync::mpsc::Sender;

    use crate::{
        engines::engine_base::engine_base::{
            EngineBase, EnginePositions, SearchEngine, SearchResult,
        },
        helpers::helpers::build_default_client,
    };

    lazy_static! {
        // Matches the `id="b_results"` attribute; handed to the start check in `push_packet`.
        static ref RESULTS_START: Regex = Regex::new(r#"id="b_results""#).unwrap();
        // Matches one `<li class="b_algo" ...>` entry and captures the named groups
        // `url`, `title` and `description`.
        static ref SINGLE_RESULT: Regex = Regex::new(r#"<li class="b_algo".*?<h2.*?><a href="(?P<url>.+?)".*?>(?P<title>.+?)</a></h2>.*?((<div class="b_caption.*?<p.*?)|(<p class="b_lineclamp3.*?))><span.*?</span>(?P<description>.*?)</p>.*?</li>"#).unwrap();
    }

    /// Endpoint the search request is sent to; the query is attached as the `q` parameter.
    const SEARCH_URL: &str = "https://www.bing.com/search";

    /// Bing search engine parser.
    ///
    /// All parsing state lives in `positions`; the regexes above are passed to
    /// the shared `EnginePositions` helpers.
    #[derive(Clone, Debug)]
    pub struct Bing {
        positions: EnginePositions,
    }

    impl EngineBase for Bing {
        /// Tries to extract the next result, tagging it with `SearchEngine::Bing`.
        fn parse_next<'a>(&mut self) -> Option<SearchResult> {
            self.positions
                .handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::Bing)
        }

        /// Forwards a chunk of response bytes to the shared start-check helper.
        fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
            self.positions
                .handle_start_check_using_default_method(&RESULTS_START, packet)
        }
    }

    impl Default for Bing {
        /// Equivalent to [`Bing::new`].
        fn default() -> Self {
            Self::new()
        }
    }

    impl Bing {
        /// Creates a parser with fresh `EnginePositions` state.
        pub fn new() -> Self {
            Self {
                positions: EnginePositions::new(),
            }
        }

        /// Sends `query` to Bing and hands the pending request plus `tx` to
        /// `handle_request` (not visible in this file; presumably provided by
        /// `EngineBase` — TODO confirm).
        ///
        /// The query is attached with `.query(...)` so that it is URL-encoded.
        /// Interpolating it into the URL string would let characters such as
        /// `&`, `#` or `+` in the search term change the meaning of the request.
        pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> {
            let client = build_default_client();
            let request = client.get(SEARCH_URL).query(&[("q", query)]).send();

            self.handle_request(request, tx).await
        }
    }
}

View File

@ -6,14 +6,15 @@ pub mod brave {
use tokio::sync::mpsc::Sender;
use crate::{
engines::engine_base::engine_base::{EngineBase, EnginePositions, SearchResult},
engines::engine_base::engine_base::{
EngineBase, EnginePositions, SearchEngine, SearchResult,
},
helpers::helpers::build_default_client,
};
// Regular expressions used by the Brave parser, compiled once on first use.
lazy_static! {
// Matches the opening of the `<body` tag.
static ref RESULTS_START: Regex = Regex::new(r#"<body"#).unwrap();
// Matches a single result snippet; the named groups `url`, `title` and
// `description` capture the corresponding parts of the markup.
static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="snippet svelte-.+?<a href=.(?P<url>.+?)".+?<div class="title svelte-.+?">(?P<title>.+?)</div></div>.+?<div class="snippet-description.+?">(?P<description>.+?)</div></div>"#).unwrap();
// Matches any run of one or more whitespace characters.
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
}
#[derive(Clone, Debug)]
@ -24,7 +25,7 @@ pub mod brave {
impl EngineBase for Brave {
fn parse_next<'a>(&mut self) -> Option<SearchResult> {
self.positions
.handle_block_using_default_method(&SINGLE_RESULT)
.handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::Brave)
}
fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {

View File

@ -1,141 +1,53 @@
// Search engine parser for DuckDuckGo
// Search engine parser for DuckDuckGo Search
pub mod duckduckgo {
use lazy_static::lazy_static;
use regex::Regex;
use urlencoding::decode;
use tokio::sync::mpsc::Sender;
use crate::{
engines::engine_base::engine_base::{EngineBase, SearchEngine, SearchResult},
utils::utils::decode_html_text,
engines::engine_base::engine_base::{
EngineBase, EnginePositions, SearchEngine, SearchResult,
},
helpers::helpers::build_default_client,
};
// Regular expressions used by the DuckDuckGo parser, compiled once on first use.
lazy_static! {
// Matches `id="links"`. Inside a raw string the backslashes are literal, so the
// regex engine receives `\"`, i.e. an escaped double quote.
static ref RESULTS_START: Regex = Regex::new(r#"id=\"links\""#).unwrap();
// Matches a single result block; the named groups `url`, `title` and
// `description` capture the link target, link text and snippet. The `end`
// group is repeated twice and captures the closing ` </div>` of the block.
static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="result results_links.*?<a.*?href="(?P<url>.*?)".*?>(?P<title>.*?)</a>.*?class="result__snippet".*?>(?P<description>.*?)</a>.*?class="clear".*?</div>(?P<end> </div>){2}"#).unwrap();
// Matches any run of one or more whitespace characters.
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
// Matches an HTML tag from `<` to `>`, allowing quoted attribute values that
// themselves contain `>`.
static ref STRIP_HTML_TAGS: Regex = Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap();
}
pub type CallbackType = Box<dyn FnMut(SearchResult) -> () + Send + Sync>;
const URL: &str = "https://html.duckduckgo.com/html";
#[derive(Clone, Debug)]
pub struct DuckDuckGo {
callback: CallbackType,
pub completed: bool,
results_started: bool,
pub previous_block: String,
// Holds all results until consumed by iterator
pub results: Vec<SearchResult>,
positions: EnginePositions,
}
// impl Stream for DuckDuckGo {
// type Item = String;
//
// fn poll_next(
// self: Pin<&mut Self>,
// cx: &mut Context<'_>,
// ) -> std::task::Poll<Option<Self::Item>> {
// if self.results.len() > 0 {
// let result = &mut self.results.pop_front().unwrap();
//
// let html = format!("<br><h2>{}</h2><p>{}</p>", result.title, result.description);
//
// return Poll::Ready(Some(html));
// }
//
// if self.completed {
// return Poll::Ready(None);
// }
//
// Poll::Pending
// }
// }
// impl Iterator for DuckDuckGo {
// type Item = SearchResult;
//
// fn next(&mut self) -> Option<SearchResult> {
// if self.results.len() > 0 {
// let oldest = self.results.pop_front().unwrap();
//
// Some(oldest)
// } else {
// None
// }
// }
// }
impl EngineBase for DuckDuckGo {
fn parse_next<'a>(&mut self) -> Option<SearchResult> {
if self.results_started {
match SINGLE_RESULT.captures(&self.previous_block.to_owned()) {
Some(captures) => {
let title = decode(captures.name("title").unwrap().as_str())
.unwrap()
.into_owned();
let description_raw =
decode_html_text(captures.name("description").unwrap().as_str())
.unwrap();
let description = STRIP_HTML_TAGS
.replace_all(&description_raw, "")
.into_owned();
let url = decode(captures.name("url").unwrap().as_str())
.unwrap()
.into_owned();
let result = SearchResult {
title,
description,
url,
engine: SearchEngine::DuckDuckGo,
};
let end_position = captures.name("end").unwrap().end();
self.slice_remaining_block(&end_position);
return Some(result);
}
None => {}
}
}
None
self.positions
.handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::DuckDuckGo)
}
fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
let bytes: Vec<u8> = packet.map(|bit| *bit).collect();
let raw_text = String::from_utf8_lossy(&bytes);
let text = STRIP.replace_all(&raw_text, " ");
if self.results_started {
self.previous_block.push_str(&text);
} else {
self.results_started = RESULTS_START.is_match(&text);
}
self.positions
.handle_start_check_using_default_method(&RESULTS_START, packet)
}
}
impl DuckDuckGo {
/// Drops the first `start_position` bytes of `previous_block`, keeping the rest.
///
/// # Panics
///
/// Panics if `start_position` is past the end of the block or does not fall
/// on a UTF-8 character boundary.
fn slice_remaining_block(&mut self, start_position: &usize) {
    // `drain` removes the consumed prefix in place; the returned iterator is
    // dropped immediately. This avoids copying the whole buffer into two
    // temporary byte vectors and re-validating it as UTF-8.
    self.previous_block.drain(..*start_position);
}
pub fn new() -> Self {
Self {
callback: Box::new(|_: SearchResult| {}),
results_started: false,
previous_block: String::new(),
results: vec![],
completed: false,
positions: EnginePositions::new(),
}
}
pub fn set_callback(&mut self, callback: CallbackType) {
self.callback = callback;
pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> {
let client = build_default_client();
let params = [("q", query)];
let request = client.post(URL).form(&params).send();
self.handle_request(request, tx).await
}
}
}

View File

@ -1,5 +1,5 @@
pub mod engine_base {
use std::sync::Arc;
use std::{fmt::Display, sync::Arc};
use futures::{lock::Mutex, Future, StreamExt};
use lazy_static::lazy_static;
@ -18,9 +18,21 @@ pub mod engine_base {
/// The search engines a `SearchResult` can originate from.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum SearchEngine {
    Brave,
    Bing,
    DuckDuckGo,
}

impl Display for SearchEngine {
    /// Writes the engine's name exactly as spelled in the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Brave => "Brave",
            Self::Bing => "Bing",
            Self::DuckDuckGo => "DuckDuckGo",
        };

        f.write_str(label)
    }
}
#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct SearchResult {
pub title: String,
@ -29,15 +41,6 @@ pub mod engine_base {
pub engine: SearchEngine,
}
/// ResultsCollector collects results across multiple tasks
#[derive(Clone, Debug, Hash, Default)]
pub struct ResultsCollector {
// NOTE(review): not touched by the collector's own methods; presumably set by
// the engine once the result section has been reached — confirm against callers.
pub started: bool,
// NOTE(review): not touched by the collector's own methods; presumably holds
// response text that has not been parsed yet — confirm against callers.
pub previous_block: String,
// Every result added so far, in insertion order.
results: Vec<SearchResult>,
// Position used by `get_next_items` / `update_index` / `has_more_results`
// to tell already-handled results from new ones.
current_index: usize,
}
pub trait EngineBase {
fn parse_next<'a>(&mut self) -> Option<SearchResult>;
@ -82,45 +85,6 @@ pub mod engine_base {
}
}
impl ResultsCollector {
    /// Creates an empty collector: no results, index 0, empty block, not started.
    pub fn new() -> Self {
        Self {
            results: Vec::new(),
            current_index: 0,
            previous_block: String::new(),
            started: false,
        }
    }

    /// Returns every result collected so far, in insertion order.
    pub fn results(&self) -> &Vec<SearchResult> {
        &self.results
    }

    /// Appends `result` to the collection.
    pub fn add_result(&mut self, result: SearchResult) {
        self.results.push(result);
    }

    /// Returns the results stored after `current_index`, or an empty slice
    /// when the index is at or past the end.
    ///
    /// NOTE(review): the element at `current_index` itself is excluded, so with
    /// the initial index of 0 the first result is never returned — confirm
    /// whether that is intended.
    pub fn get_next_items(&self) -> &[SearchResult] {
        if self.current_index >= self.results.len() {
            return &[];
        }

        &self.results[self.current_index + 1..]
    }

    /// Moves `current_index` to the last stored result.
    ///
    /// With no results the index stays at 0. The previous `len() - 1`
    /// underflowed in that case, panicking in debug builds and wrapping to
    /// `usize::MAX` in release builds.
    pub fn update_index(&mut self) {
        self.current_index = self.results.len().saturating_sub(1);
    }

    /// Returns `true` while results exist after `current_index`.
    ///
    /// An empty collector also reports `true`.
    pub fn has_more_results(&self) -> bool {
        if self.results.is_empty() {
            return true;
        }

        self.current_index < self.results.len() - 1
    }
}
#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct EnginePositions {
pub previous_block: String,
@ -163,6 +127,7 @@ pub mod engine_base {
pub fn handle_block_using_default_method(
&mut self,
single_result_regex: &Regex,
engine: SearchEngine,
) -> Option<SearchResult> {
if self.started {
if let Some(capture) = single_result_regex.captures(&self.previous_block.to_owned())
@ -183,7 +148,7 @@ pub mod engine_base {
title,
description,
url,
engine: SearchEngine::DuckDuckGo,
engine,
};
let end_position = capture.get(0).unwrap().end();

View File

@ -2,13 +2,7 @@
// This module differs from utils in that everything here
// is specifically related to the project
pub mod helpers {
use std::sync::Arc;
use bytes::Bytes;
use futures::{lock::Mutex, Future, Stream, StreamExt};
use reqwest::{Client, ClientBuilder, Error, Response};
use crate::engines::engine_base::engine_base::{EngineBase, ResultsCollector};
use reqwest::{Client, ClientBuilder};
const DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.3";

View File

@ -1,9 +1,10 @@
use std::str;
use std::sync::Arc;
use std::{str, thread};
use engines::bing::bing::Bing;
use engines::brave::brave::Brave;
use engines::duckduckgo::duckduckgo::DuckDuckGo;
use engines::engine_base::engine_base::{ResultsCollector, SearchResult};
use engines::engine_base::engine_base::SearchResult;
use futures::lock::Mutex;
use lazy_static::lazy_static;
use rocket::response::content::{RawCss, RawHtml};
@ -50,17 +51,34 @@ fn get_tailwindcss() -> RawCss<&'static str> {
#[get("/searchquery?<query>")]
async fn hello<'a>(query: &str) -> RawHtml<TextStream![String]> {
let query_box = query.to_string();
let query_brave = query.to_owned().clone();
let query_duckduckgo = query.to_owned().clone();
let query_bing = query.to_owned().clone();
let mut first_result_yielded = false;
let first_result_start = Instant::now();
let (tx, mut rx) = mpsc::channel::<SearchResult>(16);
let tx_brave = tx.clone();
let tx_duckduckgo = tx.clone();
let tx_bing = tx.clone();
tokio::spawn(async move {
let mut brave = Brave::new();
brave.search(&query_box, tx).await;
brave.search(&query_brave, tx_brave).await;
});
tokio::spawn(async move {
let mut duckduckgo = DuckDuckGo::new();
duckduckgo.search(&query_duckduckgo, tx_duckduckgo).await;
});
tokio::spawn(async move {
let mut bing = Bing::new();
bing.search(&query_bing, tx_bing).await;
});
RawHtml(TextStream! {
@ -74,7 +92,7 @@ async fn hello<'a>(query: &str) -> RawHtml<TextStream![String]> {
yield format!("<strong>Time taken: {}ms</strong>", diff);
}
let text = format!("<li><h1>{}</h1><p>{}</p></li>", &result.title, &result.description);
let text = format!("<li><h1>{}</h1><p>{}</p><i>{}</i></li>", &result.title, &result.description, &result.engine.to_string());
yield text.to_string();
}