From a230e8b9db9abc2153d6efad93c2116214fc547c Mon Sep 17 00:00:00 2001
From: Myzel394 <50424412+Myzel394@users.noreply.github.com>
Date: Wed, 21 Feb 2024 11:07:19 +0100
Subject: [PATCH] feat: Improve bing; Add concurrent search engine
---
src/engines.rs | 1 +
src/engines/bing.rs | 53 +++++++++++++++
src/engines/brave.rs | 7 +-
src/engines/duckduckgo.rs | 128 ++++++-------------------------------
src/engines/engine_base.rs | 65 +++++--------------
src/helpers.rs | 8 +--
src/main.rs | 28 ++++++--
7 files changed, 117 insertions(+), 173 deletions(-)
create mode 100644 src/engines/bing.rs
diff --git a/src/engines.rs b/src/engines.rs
index 9311cee..cb584d1 100644
--- a/src/engines.rs
+++ b/src/engines.rs
@@ -1,3 +1,4 @@
+pub mod bing;
pub mod brave;
pub mod duckduckgo;
pub mod engine_base;
diff --git a/src/engines/bing.rs b/src/engines/bing.rs
new file mode 100644
index 0000000..7f73f23
--- /dev/null
+++ b/src/engines/bing.rs
@@ -0,0 +1,53 @@
+// Search engine parser for Bing Search
+// This uses the clearnet, unlocalized version of the search engine.
+pub mod bing {
+ use lazy_static::lazy_static;
+ use regex::Regex;
+ use tokio::sync::mpsc::Sender;
+
+ use crate::{
+ engines::engine_base::engine_base::{
+ EngineBase, EnginePositions, SearchEngine, SearchResult,
+ },
+ helpers::helpers::build_default_client,
+ };
+
+ lazy_static! {
+ static ref RESULTS_START: Regex = Regex::new(r#"id="b_results""#).unwrap();
+ static ref SINGLE_RESULT: Regex = Regex::new(r#"
(?P.+?).*?(((?P.*?).*?"#).unwrap();
+ }
+
+ #[derive(Clone, Debug)]
+ pub struct Bing { // Bing result parser; its only state is the `positions` field
+ positions: EnginePositions, // receives the regexes in `parse_next` / `push_packet`
+ }
+
+ impl EngineBase for Bing { // both methods only forward to `self.positions`
+ fn parse_next<'a>(&mut self) -> Option {
+ self.positions // forwards SINGLE_RESULT and the SearchEngine::Bing tag to the shared default handler
+ .handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::Bing)
+ }
+
+ fn push_packet<'a>(&mut self, packet: impl Iterator- ) {
+ self.positions // forwards `packet` with RESULTS_START; presumably detects where results begin — verify in engine_base
+ .handle_start_check_using_default_method(&RESULTS_START, packet)
+ }
+ }
+
+ impl Bing {
+ pub fn new() -> Self { // starts with a fresh `EnginePositions::new()`
+ Self {
+ positions: EnginePositions::new(),
+ }
+ }
+
+ pub async fn search(&mut self, query: &str, tx: Sender) -> Result<(), ()> { // GET Bing's search page for `query`, then hand the request to `handle_request`
+ let client = build_default_client();
+ let request = client
+ .get(format!("https://www.bing.com/search?q={}", query)) // NOTE(review): `query` is interpolated verbatim — confirm callers percent-encode it
+ .send(); // not awaited here; the pending request is passed on below
+
+ self.handle_request(request, tx).await // presumably an `EngineBase` method that reads the response and reports via `tx` — TODO confirm
+ }
+ }
+}
diff --git a/src/engines/brave.rs b/src/engines/brave.rs
index cb594eb..ac72a35 100644
--- a/src/engines/brave.rs
+++ b/src/engines/brave.rs
@@ -6,14 +6,15 @@ pub mod brave {
use tokio::sync::mpsc::Sender;
use crate::{
- engines::engine_base::engine_base::{EngineBase, EnginePositions, SearchResult},
+ engines::engine_base::engine_base::{
+ EngineBase, EnginePositions, SearchEngine, SearchResult,
+ },
helpers::helpers::build_default_client,
};
lazy_static! {
static ref RESULTS_START: Regex = Regex::new(r#"(?P.+?)
.+?(?P.+?)
"#).unwrap();
- static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
}
#[derive(Clone, Debug)]
@@ -24,7 +25,7 @@ pub mod brave {
impl EngineBase for Brave {
fn parse_next<'a>(&mut self) -> Option {
self.positions
- .handle_block_using_default_method(&SINGLE_RESULT)
+ .handle_block_using_default_method(&SINGLE_RESULT, SearchEngine::Brave)
}
fn push_packet<'a>(&mut self, packet: impl Iterator- ) {
diff --git a/src/engines/duckduckgo.rs b/src/engines/duckduckgo.rs
index fbe8866..b0efac4 100644
--- a/src/engines/duckduckgo.rs
+++ b/src/engines/duckduckgo.rs
@@ -1,141 +1,53 @@
-// Search engine parser for DuckDuckGo
+// Search engine parser for DuckDuckGo Search
pub mod duckduckgo {
use lazy_static::lazy_static;
use regex::Regex;
- use urlencoding::decode;
+ use tokio::sync::mpsc::Sender;
use crate::{
- engines::engine_base::engine_base::{EngineBase, SearchEngine, SearchResult},
- utils::utils::decode_html_text,
+ engines::engine_base::engine_base::{
+ EngineBase, EnginePositions, SearchEngine, SearchResult,
+ },
+ helpers::helpers::build_default_client,
};
lazy_static! {
static ref RESULTS_START: Regex = Regex::new(r#"id=\"links\""#).unwrap();
static ref SINGLE_RESULT: Regex = Regex::new(r#"
.*?)".*?>(?P
.*?).*?class="result__snippet".*?>(?P.*?).*?class="clear".*?(?P ){2}"#).unwrap();
- static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
- static ref STRIP_HTML_TAGS: Regex = Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap();
}
- pub type CallbackType = Box () + Send + Sync>;
+ const URL: &str = "https://html.duckduckgo.com/html";
+ #[derive(Clone, Debug)]
pub struct DuckDuckGo {
- callback: CallbackType,
- pub completed: bool,
- results_started: bool,
- pub previous_block: String,
- // Holds all results until consumed by iterator
- pub results: Vec,
+ positions: EnginePositions,
}
- // impl Stream for DuckDuckGo {
- // type Item = String;
- //
- // fn poll_next(
- // self: Pin<&mut Self>,
- // cx: &mut Context<'_>,
- // ) -> std::task::Poll