mirror of
https://github.com/Myzel394/tifsep.git
synced 2025-06-18 15:35:26 +02:00
fix: cleanup; improve code quality
This commit is contained in:
parent
5d076328ec
commit
c19371d079
@ -1,27 +1,19 @@
|
|||||||
// Search engine parser for Brave Search
|
// Search engine parser for Brave Search
|
||||||
// This uses the clearnet, unlocalized version of the search engine.
|
// This uses the clearnet, unlocalized version of the search engine.
|
||||||
pub mod brave {
|
pub mod brave {
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use futures::lock::Mutex;
|
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use tokio::sync::mpsc::Sender;
|
use tokio::sync::mpsc::Sender;
|
||||||
use urlencoding::decode;
|
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
engines::engine_base::engine_base::{
|
engines::engine_base::engine_base::{EngineBase, EnginePositions, SearchResult},
|
||||||
EngineBase, EnginePositions, ResultsCollector, SearchEngine, SearchResult,
|
|
||||||
},
|
|
||||||
helpers::helpers::build_default_client,
|
helpers::helpers::build_default_client,
|
||||||
utils::utils::decode_html_text,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref RESULTS_START: Regex = Regex::new(r#"<body"#).unwrap();
|
static ref RESULTS_START: Regex = Regex::new(r#"<body"#).unwrap();
|
||||||
static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="snippet svelte-.+?<a href=.(?P<url>.+?)".+?<div class="title svelte-.+?">(?P<title>.+?)</div></div>.+?<div class="snippet-description.+?">(?P<description>.+?)</div></div>"#).unwrap();
|
static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="snippet svelte-.+?<a href=.(?P<url>.+?)".+?<div class="title svelte-.+?">(?P<title>.+?)</div></div>.+?<div class="snippet-description.+?">(?P<description>.+?)</div></div>"#).unwrap();
|
||||||
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
|
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
|
||||||
static ref STRIP_HTML_TAGS: Regex = Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
@ -31,49 +23,13 @@ pub mod brave {
|
|||||||
|
|
||||||
impl EngineBase for Brave {
|
impl EngineBase for Brave {
|
||||||
fn parse_next<'a>(&mut self) -> Option<SearchResult> {
|
fn parse_next<'a>(&mut self) -> Option<SearchResult> {
|
||||||
if self.positions.started {
|
self.positions
|
||||||
if let Some(capture) =
|
.handle_block_using_default_method(&SINGLE_RESULT)
|
||||||
SINGLE_RESULT.captures(&self.positions.previous_block.to_owned())
|
|
||||||
{
|
|
||||||
let title = decode(capture.name("title").unwrap().as_str())
|
|
||||||
.unwrap()
|
|
||||||
.into_owned();
|
|
||||||
let description_raw =
|
|
||||||
decode_html_text(capture.name("description").unwrap().as_str()).unwrap();
|
|
||||||
let description = STRIP_HTML_TAGS
|
|
||||||
.replace_all(&description_raw, "")
|
|
||||||
.into_owned();
|
|
||||||
let url = decode(capture.name("url").unwrap().as_str())
|
|
||||||
.unwrap()
|
|
||||||
.into_owned();
|
|
||||||
|
|
||||||
let result = SearchResult {
|
|
||||||
title,
|
|
||||||
description,
|
|
||||||
url,
|
|
||||||
engine: SearchEngine::DuckDuckGo,
|
|
||||||
};
|
|
||||||
|
|
||||||
let end_position = capture.get(0).unwrap().end();
|
|
||||||
self.positions.slice_remaining_block(&end_position);
|
|
||||||
|
|
||||||
return Some(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
None
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
|
fn push_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
|
||||||
let bytes: Vec<u8> = packet.map(|bit| *bit).collect();
|
self.positions
|
||||||
let raw_text = String::from_utf8_lossy(&bytes);
|
.handle_start_check_using_default_method(&RESULTS_START, packet)
|
||||||
let text = STRIP.replace_all(&raw_text, " ");
|
|
||||||
|
|
||||||
if self.positions.started {
|
|
||||||
self.positions.previous_block.push_str(&text);
|
|
||||||
} else {
|
|
||||||
self.positions.started = RESULTS_START.is_match(&text);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,13 +40,13 @@ pub mod brave {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) {
|
pub async fn search(&mut self, query: &str, tx: Sender<SearchResult>) -> Result<(), ()> {
|
||||||
let client = build_default_client();
|
let client = build_default_client();
|
||||||
let request = client
|
let request = client
|
||||||
.get(format!("https://search.brave.com/search?q={}", query))
|
.get(format!("https://search.brave.com/search?q={}", query))
|
||||||
.send();
|
.send();
|
||||||
|
|
||||||
self.handle_request(request, tx).await;
|
self.handle_request(request, tx).await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,17 +6,22 @@ pub mod engine_base {
|
|||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use reqwest::{Error, Response};
|
use reqwest::{Error, Response};
|
||||||
use tokio::sync::mpsc::Sender;
|
use tokio::sync::mpsc::Sender;
|
||||||
|
use urlencoding::decode;
|
||||||
|
|
||||||
|
use crate::utils::utils::decode_html_text;
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
|
static ref STRIP: Regex = Regex::new(r"[\s\n]+").unwrap();
|
||||||
|
static ref STRIP_HTML_TAGS: Regex =
|
||||||
|
Regex::new(r#"<(?:"[^"]*"['"]*|'[^']*'['"]*|[^'">])+>"#).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy, Debug, Hash)]
|
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
pub enum SearchEngine {
|
pub enum SearchEngine {
|
||||||
DuckDuckGo,
|
DuckDuckGo,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Hash)]
|
#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
pub struct SearchResult {
|
pub struct SearchResult {
|
||||||
pub title: String,
|
pub title: String,
|
||||||
pub url: String,
|
pub url: String,
|
||||||
@ -52,7 +57,7 @@ pub mod engine_base {
|
|||||||
&mut self,
|
&mut self,
|
||||||
request: impl Future<Output = Result<Response, Error>>,
|
request: impl Future<Output = Result<Response, Error>>,
|
||||||
tx: Sender<SearchResult>,
|
tx: Sender<SearchResult>,
|
||||||
) {
|
) -> Result<(), ()> {
|
||||||
let mut stream = request.await.unwrap().bytes_stream();
|
let mut stream = request.await.unwrap().bytes_stream();
|
||||||
|
|
||||||
while let Some(chunk) = stream.next().await {
|
while let Some(chunk) = stream.next().await {
|
||||||
@ -61,13 +66,19 @@ pub mod engine_base {
|
|||||||
self.push_packet(buffer.iter());
|
self.push_packet(buffer.iter());
|
||||||
|
|
||||||
while let Some(result) = self.parse_next() {
|
while let Some(result) = self.parse_next() {
|
||||||
tx.send(result).await;
|
if tx.send(result).await.is_err() {
|
||||||
|
return Err(());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while let Some(result) = self.parse_next() {
|
while let Some(result) = self.parse_next() {
|
||||||
tx.send(result).await;
|
if tx.send(result).await.is_err() {
|
||||||
|
return Err(());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -132,5 +143,57 @@ pub mod engine_base {
|
|||||||
self.previous_block.clear();
|
self.previous_block.clear();
|
||||||
self.previous_block.push_str(&remaining_text);
|
self.previous_block.push_str(&remaining_text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn handle_start_check_using_default_method<'a>(
|
||||||
|
&mut self,
|
||||||
|
results_start_regex: &Regex,
|
||||||
|
packet: impl Iterator<Item = &'a u8>,
|
||||||
|
) {
|
||||||
|
let bytes: Vec<u8> = packet.map(|bit| *bit).collect();
|
||||||
|
let raw_text = String::from_utf8_lossy(&bytes);
|
||||||
|
let text = STRIP.replace_all(&raw_text, " ");
|
||||||
|
|
||||||
|
if self.started {
|
||||||
|
self.previous_block.push_str(&text);
|
||||||
|
} else {
|
||||||
|
self.started = results_start_regex.is_match(&text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn handle_block_using_default_method(
|
||||||
|
&mut self,
|
||||||
|
single_result_regex: &Regex,
|
||||||
|
) -> Option<SearchResult> {
|
||||||
|
if self.started {
|
||||||
|
if let Some(capture) = single_result_regex.captures(&self.previous_block.to_owned())
|
||||||
|
{
|
||||||
|
let title = decode(capture.name("title").unwrap().as_str())
|
||||||
|
.unwrap()
|
||||||
|
.into_owned();
|
||||||
|
let description_raw =
|
||||||
|
decode_html_text(capture.name("description").unwrap().as_str()).unwrap();
|
||||||
|
let description = STRIP_HTML_TAGS
|
||||||
|
.replace_all(&description_raw, "")
|
||||||
|
.into_owned();
|
||||||
|
let url = decode(capture.name("url").unwrap().as_str())
|
||||||
|
.unwrap()
|
||||||
|
.into_owned();
|
||||||
|
|
||||||
|
let result = SearchResult {
|
||||||
|
title,
|
||||||
|
description,
|
||||||
|
url,
|
||||||
|
engine: SearchEngine::DuckDuckGo,
|
||||||
|
};
|
||||||
|
|
||||||
|
let end_position = capture.get(0).unwrap().end();
|
||||||
|
self.slice_remaining_block(&end_position);
|
||||||
|
|
||||||
|
return Some(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,7 @@ use std::sync::Arc;
|
|||||||
use std::{str, thread};
|
use std::{str, thread};
|
||||||
|
|
||||||
use engines::brave::brave::Brave;
|
use engines::brave::brave::Brave;
|
||||||
|
use engines::duckduckgo::duckduckgo::DuckDuckGo;
|
||||||
use engines::engine_base::engine_base::{ResultsCollector, SearchResult};
|
use engines::engine_base::engine_base::{ResultsCollector, SearchResult};
|
||||||
use futures::lock::Mutex;
|
use futures::lock::Mutex;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user