mirror of
https://github.com/Myzel394/tifsep.git
synced 2025-06-18 15:35:26 +02:00
feat: Improve bing; Add concurrent search engine
This commit is contained in:
parent
c19371d079
commit
a230e8b9db
@ -1,3 +1,4 @@
|
||||
pub mod bing;
|
||||
pub mod brave;
|
||||
pub mod duckduckgo;
|
||||
pub mod engine_base;
|
||||
|
53
src/engines/bing.rs
Normal file
53
src/engines/bing.rs
Normal file
@ -0,0 +1,53 @@
|
||||
// Search engine parser for Bing Search
|
||||
// This uses the clearnet, unlocalized version of the search engine.
|
||||
pub mod bing {
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
use crate::{
|
||||
engines::engine_base::engine_base::{
|
||||
EngineBase, EnginePositions, SearchEngine, SearchResult,
|
||||
},
|
||||
helpers::helpers::build_default_client,
|
||||
};
|
||||
|
||||
lazy_static! {
|
||||
static ref RESULTS_START: Regex = Regex::new(r#"id="b_results""#).unwrap();
|
||||
static ref SINGLE_RESULT: Regex = Regex::new(r#"<li class="b_algo".*?<h2.*?><a href="(?P<url>.+?)".*?>(?P<title>.+?)</a></h2>.*?((<div class="b_caption.*?<p.*?)|(<p class="b_lineclamp3.*?))><span.*?</span>(?P<description>.*?)</p>.*?</li>"#).unwrap();
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Bing {
|
||||
positions: EnginePositions,
|
||||
}
|
||||
|
||||
impl EngineBase for Bing {
|
||||
fn parse_next<'a>(&mut self) -> Option<SearchResult> {
|
||||
self.positions
|
||||
.handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::Bing)
|
||||
}
|
||||
|
||||
fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
|
||||
self.positions
|
||||
.handle_start_check_using_default_method(&RESULTS_START, packet)
|
||||
}
|
||||
}
|
||||
|
||||
impl Bing {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
positions: EnginePositions::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> {
|
||||
let client = build_default_client();
|
||||
let request = client
|
||||
.get(format!("https://www.bing.com/search?q={}", query))
|
||||
.send();
|
||||
|
||||
self.handle_request(request, tx).await
|
||||
}
|
||||
}
|
||||
}
|
@ -6,14 +6,15 @@ pub mod brave {
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
use crate::{
|
||||
engines::engine_base::engine_base::{EngineBase, EnginePositions, SearchResult},
|
||||
engines::engine_base::engine_base::{
|
||||
EngineBase, EnginePositions, SearchEngine, SearchResult,
|
||||
},
|
||||
helpers::helpers::build_default_client,
|
||||
};
|
||||
|
||||
lazy_static! {
|
||||
static ref RESULTS_START: Regex = Regex::new(r#"<body"#).unwrap();
|
||||
static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="snippet svelte-.+?<a href=.(?P<url>.+?)".+?<div class="title svelte-.+?">(?P<title>.+?)</div></div>.+?<div class="snippet-description.+?">(?P<description>.+?)</div></div>"#).unwrap();
|
||||
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@ -24,7 +25,7 @@ pub mod brave {
|
||||
impl EngineBase for Brave {
|
||||
fn parse_next<'a>(&mut self) -> Option<SearchResult> {
|
||||
self.positions
|
||||
.handle_block_using_default_method(&SINGLE_RESULT)
|
||||
.handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::Brave)
|
||||
}
|
||||
|
||||
fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
|
||||
|
@ -1,141 +1,53 @@
|
||||
// Search engine parser for DuckDuckGo
|
||||
// Search engine parser for DuckDuckGo Search
|
||||
pub mod duckduckgo {
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use urlencoding::decode;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
use crate::{
|
||||
engines::engine_base::engine_base::{EngineBase, SearchEngine, SearchResult},
|
||||
utils::utils::decode_html_text,
|
||||
engines::engine_base::engine_base::{
|
||||
EngineBase, EnginePositions, SearchEngine, SearchResult,
|
||||
},
|
||||
helpers::helpers::build_default_client,
|
||||
};
|
||||
|
||||
lazy_static! {
|
||||
static ref RESULTS_START: Regex = Regex::new(r#"id=\"links\""#).unwrap();
|
||||
static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="result results_links.*?<a.*?href="(?P<url>.*?)".*?>(?P<title>.*?)</a>.*?class="result__snippet".*?>(?P<description>.*?)</a>.*?class="clear".*?</div>(?P<end> </div>){2}"#).unwrap();
|
||||
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
|
||||
static ref STRIP_HTML_TAGS: Regex = Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap();
|
||||
}
|
||||
|
||||
pub type CallbackType = Box<dyn FnMut(SearchResult) -> () + Send + Sync>;
|
||||
const URL: &str = "https://html.duckduckgo.com/html";
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DuckDuckGo {
|
||||
callback: CallbackType,
|
||||
pub completed: bool,
|
||||
results_started: bool,
|
||||
pub previous_block: String,
|
||||
// Holds all results until consumed by iterator
|
||||
pub results: Vec<SearchResult>,
|
||||
positions: EnginePositions,
|
||||
}
|
||||
|
||||
// impl Stream for DuckDuckGo {
|
||||
// type Item = String;
|
||||
//
|
||||
// fn poll_next(
|
||||
// self: Pin<&mut Self>,
|
||||
// cx: &mut Context<'_>,
|
||||
// ) -> std::task::Poll<Option<Self::Item>> {
|
||||
// if self.results.len() > 0 {
|
||||
// let result = &mut self.results.pop_front().unwrap();
|
||||
//
|
||||
// let html = format!("<br><h2>{}</h2><p>{}</p>", result.title, result.description);
|
||||
//
|
||||
// return Poll::Ready(Some(html));
|
||||
// }
|
||||
//
|
||||
// if self.completed {
|
||||
// return Poll::Ready(None);
|
||||
// }
|
||||
//
|
||||
// Poll::Pending
|
||||
// }
|
||||
// }
|
||||
|
||||
// impl Iterator for DuckDuckGo {
|
||||
// type Item = SearchResult;
|
||||
//
|
||||
// fn next(&mut self) -> Option<SearchResult> {
|
||||
// if self.results.len() > 0 {
|
||||
// let oldest = self.results.pop_front().unwrap();
|
||||
//
|
||||
// Some(oldest)
|
||||
// } else {
|
||||
// None
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
impl EngineBase for DuckDuckGo {
|
||||
fn parse_next<'a>(&mut self) -> Option<SearchResult> {
|
||||
if self.results_started {
|
||||
match SINGLE_RESULT.captures(&self.previous_block.to_owned()) {
|
||||
Some(captures) => {
|
||||
let title = decode(captures.name("title").unwrap().as_str())
|
||||
.unwrap()
|
||||
.into_owned();
|
||||
let description_raw =
|
||||
decode_html_text(captures.name("description").unwrap().as_str())
|
||||
.unwrap();
|
||||
let description = STRIP_HTML_TAGS
|
||||
.replace_all(&description_raw, "")
|
||||
.into_owned();
|
||||
let url = decode(captures.name("url").unwrap().as_str())
|
||||
.unwrap()
|
||||
.into_owned();
|
||||
|
||||
let result = SearchResult {
|
||||
title,
|
||||
description,
|
||||
url,
|
||||
engine: SearchEngine::DuckDuckGo,
|
||||
};
|
||||
|
||||
let end_position = captures.name("end").unwrap().end();
|
||||
self.slice_remaining_block(&end_position);
|
||||
|
||||
return Some(result);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
self.positions
|
||||
.handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::DuckDuckGo)
|
||||
}
|
||||
|
||||
fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
|
||||
let bytes: Vec<u8> = packet.map(|bit| *bit).collect();
|
||||
let raw_text = String::from_utf8_lossy(&bytes);
|
||||
let text = STRIP.replace_all(&raw_text, " ");
|
||||
|
||||
if self.results_started {
|
||||
self.previous_block.push_str(&text);
|
||||
} else {
|
||||
self.results_started = RESULTS_START.is_match(&text);
|
||||
}
|
||||
self.positions
|
||||
.handle_start_check_using_default_method(&RESULTS_START, packet)
|
||||
}
|
||||
}
|
||||
|
||||
impl DuckDuckGo {
|
||||
fn slice_remaining_block(&mut self, start_position: &usize) {
|
||||
let previous_block_bytes = self.previous_block.as_bytes().to_vec();
|
||||
let remaining_bytes = previous_block_bytes[*start_position..].to_vec();
|
||||
let remaining_text = String::from_utf8(remaining_bytes).unwrap();
|
||||
|
||||
self.previous_block.clear();
|
||||
self.previous_block.push_str(&remaining_text);
|
||||
}
|
||||
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
callback: Box::new(|_: SearchResult| {}),
|
||||
results_started: false,
|
||||
previous_block: String::new(),
|
||||
results: vec![],
|
||||
completed: false,
|
||||
positions: EnginePositions::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_callback(&mut self, callback: CallbackType) {
|
||||
self.callback = callback;
|
||||
pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> {
|
||||
let client = build_default_client();
|
||||
let params = [("q", query)];
|
||||
let request = client.post(URL).form(¶ms).send();
|
||||
|
||||
self.handle_request(request, tx).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
pub mod engine_base {
|
||||
use std::sync::Arc;
|
||||
use std::{fmt::Display, sync::Arc};
|
||||
|
||||
use futures::{lock::Mutex, Future, StreamExt};
|
||||
use lazy_static::lazy_static;
|
||||
@ -18,9 +18,21 @@ pub mod engine_base {
|
||||
|
||||
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum SearchEngine {
|
||||
Brave,
|
||||
Bing,
|
||||
DuckDuckGo,
|
||||
}
|
||||
|
||||
impl Display for SearchEngine {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
SearchEngine::Brave => write!(f, "Brave"),
|
||||
SearchEngine::DuckDuckGo => write!(f, "DuckDuckGo"),
|
||||
SearchEngine::Bing => write!(f, "Bing"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct SearchResult {
|
||||
pub title: String,
|
||||
@ -29,15 +41,6 @@ pub mod engine_base {
|
||||
pub engine: SearchEngine,
|
||||
}
|
||||
|
||||
/// ResultsCollector collects results across multiple tasks
|
||||
#[derive(Clone, Debug, Hash, Default)]
|
||||
pub struct ResultsCollector {
|
||||
pub started: bool,
|
||||
pub previous_block: String,
|
||||
results: Vec<SearchResult>,
|
||||
current_index: usize,
|
||||
}
|
||||
|
||||
pub trait EngineBase {
|
||||
fn parse_next<'a>(&mut self) -> Option<SearchResult>;
|
||||
|
||||
@ -82,45 +85,6 @@ pub mod engine_base {
|
||||
}
|
||||
}
|
||||
|
||||
impl ResultsCollector {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
results: Vec::new(),
|
||||
current_index: 0,
|
||||
previous_block: String::new(),
|
||||
started: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn results(&self) -> &Vec<SearchResult> {
|
||||
&self.results
|
||||
}
|
||||
|
||||
pub fn add_result(&mut self, result: SearchResult) {
|
||||
self.results.push(result);
|
||||
}
|
||||
|
||||
pub fn get_next_items(&self) -> &[SearchResult] {
|
||||
if self.current_index >= self.results.len() {
|
||||
return &[];
|
||||
}
|
||||
|
||||
&self.results[self.current_index + 1..self.results.len()]
|
||||
}
|
||||
|
||||
pub fn update_index(&mut self) {
|
||||
self.current_index = self.results.len() - 1;
|
||||
}
|
||||
|
||||
pub fn has_more_results(&self) -> bool {
|
||||
if self.results.len() == 0 {
|
||||
return true;
|
||||
}
|
||||
|
||||
self.current_index < self.results.len() - 1
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct EnginePositions {
|
||||
pub previous_block: String,
|
||||
@ -163,6 +127,7 @@ pub mod engine_base {
|
||||
pub fn handle_block_using_default_method(
|
||||
&mut self,
|
||||
single_result_regex: &Regex,
|
||||
engine: SearchEngine,
|
||||
) -> Option<SearchResult> {
|
||||
if self.started {
|
||||
if let Some(capture) = single_result_regex.captures(&self.previous_block.to_owned())
|
||||
@ -183,7 +148,7 @@ pub mod engine_base {
|
||||
title,
|
||||
description,
|
||||
url,
|
||||
engine: SearchEngine::DuckDuckGo,
|
||||
engine,
|
||||
};
|
||||
|
||||
let end_position = capture.get(0).unwrap().end();
|
||||
|
@ -2,13 +2,7 @@
|
||||
// This module differs from utils in the way that everything here
|
||||
// is specifically related to the project
|
||||
pub mod helpers {
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::{lock::Mutex, Future, Stream, StreamExt};
|
||||
use reqwest::{Client, ClientBuilder, Error, Response};
|
||||
|
||||
use crate::engines::engine_base::engine_base::{EngineBase, ResultsCollector};
|
||||
use reqwest::{Client, ClientBuilder};
|
||||
|
||||
const DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.3";
|
||||
|
||||
|
28
src/main.rs
28
src/main.rs
@ -1,9 +1,10 @@
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::{str, thread};
|
||||
|
||||
use engines::bing::bing::Bing;
|
||||
use engines::brave::brave::Brave;
|
||||
use engines::duckduckgo::duckduckgo::DuckDuckGo;
|
||||
use engines::engine_base::engine_base::{ResultsCollector, SearchResult};
|
||||
use engines::engine_base::engine_base::SearchResult;
|
||||
use futures::lock::Mutex;
|
||||
use lazy_static::lazy_static;
|
||||
use rocket::response::content::{RawCss, RawHtml};
|
||||
@ -50,17 +51,34 @@ fn get_tailwindcss() -> RawCss<&'static str> {
|
||||
|
||||
#[get("/searchquery?<query>")]
|
||||
async fn hello<'a>(query: &str) -> RawHtml<TextStream![String]> {
|
||||
let query_box = query.to_string();
|
||||
let query_brave = query.to_owned().clone();
|
||||
let query_duckduckgo = query.to_owned().clone();
|
||||
let query_bing = query.to_owned().clone();
|
||||
|
||||
let mut first_result_yielded = false;
|
||||
let first_result_start = Instant::now();
|
||||
|
||||
let (tx, mut rx) = mpsc::channel::<SearchResult>(16);
|
||||
let tx_brave = tx.clone();
|
||||
let tx_duckduckgo = tx.clone();
|
||||
let tx_bing = tx.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut brave = Brave::new();
|
||||
|
||||
brave.search(&query_box, tx).await;
|
||||
brave.search(&query_brave, tx_brave).await;
|
||||
});
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut duckduckgo = DuckDuckGo::new();
|
||||
|
||||
duckduckgo.search(&query_duckduckgo, tx_duckduckgo).await;
|
||||
});
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut bing = Bing::new();
|
||||
|
||||
bing.search(&query_bing, tx_bing).await;
|
||||
});
|
||||
|
||||
RawHtml(TextStream! {
|
||||
@ -74,7 +92,7 @@ async fn hello<'a>(query: &str) -> RawHtml<TextStream![String]> {
|
||||
yield format!("<strong>Time taken: {}ms</strong>", diff);
|
||||
}
|
||||
|
||||
let text = format!("<li><h1>{}</h1><p>{}</p></li>", &result.title, &result.description);
|
||||
let text = format!("<li><h1>{}</h1><p>{}</p><i>{}</i></li>", &result.title, &result.description, &result.engine.to_string());
|
||||
|
||||
yield text.to_string();
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user