feat: Add first wip for parsers

This commit is contained in:
Myzel394 2024-01-28 20:45:02 +01:00
parent d65f4365f0
commit d2e9d75b0e
No known key found for this signature in database
GPG Key ID: 79CC92F37B3E1A2B
7 changed files with 175 additions and 12 deletions

40
Cargo.lock generated
View File

@ -17,6 +17,15 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aho-corasick"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "autocfg" name = "autocfg"
version = "1.1.0" version = "1.1.0"
@ -537,6 +546,35 @@ dependencies = [
"bitflags 1.3.2", "bitflags 1.3.2",
] ]
[[package]]
name = "regex"
version = "1.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]] [[package]]
name = "reqwest" name = "reqwest"
version = "0.11.23" version = "0.11.23"
@ -787,6 +825,8 @@ dependencies = [
name = "tcp_test" name = "tcp_test"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"lazy_static",
"regex",
"reqwest", "reqwest",
"rustls", "rustls",
"tokio", "tokio",

View File

@ -6,6 +6,8 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
lazy_static = "1.4.0"
regex = "1.10.3"
reqwest = "0.11.23" reqwest = "0.11.23"
rustls = "0.22.2" rustls = "0.22.2"
tokio = "1.35.1" tokio = "1.35.1"

View File

@ -58,7 +58,16 @@ pub mod client {
"{} {} HTTP/1.1\r\n", "{} {} HTTP/1.1\r\n",
"Host: {}\r\n", "Host: {}\r\n",
"Connection: close\r\n", "Connection: close\r\n",
"Accept-Encoding: identity\r\n", // "Accept-Encoding: identity\r\n",
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\r\n",
"Upgrade-Insecure-Requests: 1\r\n",
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8\r\n",
"Dnt: 1\r\n",
"Accept-Language: en-US,en;q=0.9\r\n",
"Content-Type: application/x-www-form-urlencoded\r\n",
"Content-Length: 6\r\n",
"\r\n",
"q=duck\r\n",
"\r\n", "\r\n",
), ),
method, method,
@ -70,13 +79,14 @@ pub mod client {
pub fn request( pub fn request(
&self, &self,
method: &str, method: &str,
on_partial: fn(&[u8; PACKET_SIZE], &[u8]), // on_partial: Box<dyn FnMut(&[u8; PACKET_SIZE], &[u8])>,
) -> Result<Vec<u8>, Box<dyn Error>> { ) -> Result<Vec<u8>, Box<dyn Error>> {
let mut connection = self.create_connection()?; let mut connection = self.create_connection()?;
let mut sock = self.create_tcp_stream()?; let mut sock = self.create_tcp_stream()?;
let mut tls = rustls::Stream::new(&mut connection, &mut sock); let mut tls = rustls::Stream::new(&mut connection, &mut sock);
let http_header = self.create_http_header(method)?; let http_header = self.create_http_header(method)?;
dbg!(&http_header);
tls.write_all(&http_header.as_bytes())?; tls.write_all(&http_header.as_bytes())?;
// Read packages one by one // Read packages one by one
@ -86,7 +96,7 @@ pub mod client {
let n = tls.read(&mut buf)?; let n = tls.read(&mut buf)?;
on_partial(&buf, &data); // (on_partial)(&buf, &data);
if n == 0 { if n == 0 {
break; break;

2
src/engines.rs Normal file
View File

@ -0,0 +1,2 @@
pub mod duckduckgo;
pub mod engine_base;

90
src/engines/duckduckgo.rs Normal file
View File

@ -0,0 +1,90 @@
// Search engine parser for DuckDuckGo
pub mod duckduckgo {
    // Results start at:
    // <div id="links" class="results">
    // Example for a result:
    // <div class="result results_links results_links_deep web-result ">
    //   <div class="links_main links_deep result__body">
    //     <h2 class="result__title">
    //       <a
    //         rel="nofollow" class="result__a"
    //         href="https://www.speedtest.net/">
    //         Speedtest by Ookla - The Global Broadband Speed Test
    //       </a>
    //     </h2>
    //     <div class="result__extras">
    //       <div class="result__extras__url">
    //         <span class="result__icon">
    //           <a rel="nofollow" href="https://www.speedtest.net/">
    //             <img class="result__icon__img" width="16" height="16" alt=""
    //               src="//external-content.duckduckgo.com/ip3/www.speedtest.net.ico" name="i15" />
    //           </a>
    //         </span>
    //         <a class="result__url" href="https://www.speedtest.net/">
    //           www.speedtest.net
    //         </a>
    //       </div>
    //     </div>
    //     <a
    //       class="result__snippet"
    //       href="https://www.speedtest.net/">
    //       Use Speedtest on all your devices with our free desktop and mobile apps.
    //     </a>
    //     <div class="clear"></div>
    //   </div>
    // </div>
    use lazy_static::lazy_static;
    use regex::Regex;

    use crate::engines::engine_base::engine_base::{EngineBase, SearchResult};

    lazy_static! {
        /// Marker that precedes the first result in the page.
        static ref RESULTS_START: Regex = Regex::new(r#"id=\"links\""#).unwrap();
        /// Matches one complete result block in whitespace-normalised HTML and captures
        /// its `url`, `title` and `description`. The tail `( <\/div>){2}` relies on every
        /// whitespace run having been collapsed to exactly one space.
        static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="result.*?<a.*?href="(?<url>.*?)".*?>(?<title>.*?)<\/a>.*?class="result__snippet".*?>(?<description>.*?)<\/a>.*?class="clear".*?<\/div>( <\/div>){2}"#).unwrap();
        /// Any run of whitespace; used to collapse the HTML onto one line so that `.`
        /// in `SINGLE_RESULT` can span what were originally several lines.
        static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
    }

    /// Incremental parser for the DuckDuckGo HTML results page.
    ///
    /// The response is fed in packet by packet through
    /// [`EngineBase::parse_packet`]; every result that has been received completely is
    /// appended to `search_results`.
    pub struct DuckDuckGo {
        /// Results parsed so far, in the order they appeared in the response.
        pub search_results: Vec<SearchResult>,
        /// Whether the `id="links"` marker has been seen yet.
        results_started: bool,
        /// Whitespace-normalised text received after the marker that has not yet been
        /// consumed by a complete result match.
        previous_block: String,
    }

    impl EngineBase for DuckDuckGo {
        /// Returns every result parsed so far.
        fn get_search_results(&self) -> &Vec<SearchResult> {
            &self.search_results
        }

        /// Consumes one packet of the raw response.
        ///
        /// Packets received before the results marker are ignored. From the marker
        /// onwards the text is buffered, and every complete result found in the buffer
        /// is printed and stored in `search_results`.
        fn parse_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
            let bytes: Vec<u8> = packet.copied().collect();
            // NOTE(review): a multi-byte UTF-8 character split across two packets is
            // decoded as replacement characters here; confirm whether that matters for
            // the pages being parsed.
            let raw_text = String::from_utf8_lossy(&bytes);
            let normalized = STRIP.replace_all(&raw_text, " ");
            let text: &str = &normalized;

            if self.results_started {
                self.buffer(text);
            } else if let Some(marker) = RESULTS_START.find(text) {
                self.results_started = true;
                // Keep whatever follows the marker in this same packet; the first
                // result may already begin here.
                self.buffer(&text[marker.end()..]);
            } else {
                return;
            }

            self.extract_results();
        }
    }

    impl DuckDuckGo {
        /// Creates a parser that has not seen any part of the response yet.
        pub fn new() -> Self {
            Self {
                search_results: Vec::new(),
                results_started: false,
                previous_block: String::new(),
            }
        }

        /// Appends already-normalised text to the pending buffer.
        ///
        /// Whitespace is normalised per packet, so a whitespace run that straddles two
        /// packets would otherwise leave two consecutive spaces in the buffer and stop
        /// `SINGLE_RESULT` from matching; the duplicate space is dropped here.
        fn buffer(&mut self, chunk: &str) {
            let chunk = if self.previous_block.ends_with(' ') {
                chunk.strip_prefix(' ').unwrap_or(chunk)
            } else {
                chunk
            };
            self.previous_block.push_str(chunk);
        }

        /// Moves every complete result out of the pending buffer into `search_results`.
        ///
        /// Only the text up to the end of the last match is discarded, so a result that
        /// is still incomplete stays buffered until the following packets arrive.
        fn extract_results(&mut self) {
            let mut consumed = 0;

            for captures in SINGLE_RESULT.captures_iter(&self.previous_block) {
                // All three groups are mandatory parts of the pattern, so indexing
                // cannot fail once the pattern has matched.
                let title = &captures["title"];
                let description = &captures["description"];
                let url = &captures["url"];

                // Kept from the original implementation: the binary currently relies
                // on this output to show results.
                println!("{}", title);
                println!("{}", description);
                println!("{}", url);

                self.search_results.push(SearchResult {
                    title: title.trim().to_string(),
                    url: url.trim().to_string(),
                    description: description.trim().to_string(),
                });

                if let Some(whole) = captures.get(0) {
                    consumed = whole.end();
                }
            }

            self.previous_block.drain(..consumed);
        }
    }

    impl Default for DuckDuckGo {
        fn default() -> Self {
            Self::new()
        }
    }
}

View File

@ -0,0 +1,13 @@
pub mod engine_base {
    /// A single entry of a search engine's result page.
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub struct SearchResult {
        /// Headline of the result.
        pub title: String,
        /// Target address of the result.
        pub url: String,
        /// Text snippet shown for the result.
        pub description: String,
    }

    /// Common interface of the search engine parsers.
    pub trait EngineBase {
        /// Returns the search results held by the parser.
        fn get_search_results(&self) -> &Vec<SearchResult>;

        /// Feeds the bytes of one packet of the response into the parser.
        fn parse_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>);
    }
}

View File

@ -1,15 +1,21 @@
use std::{cmp::min, io::Read};
use client::client::{Client, PACKET_SIZE}; use client::client::{Client, PACKET_SIZE};
use engines::{duckduckgo::duckduckgo::DuckDuckGo, engine_base::engine_base::EngineBase};
pub mod client; pub mod client;
pub mod engines;
fn handle_response(packet: &[u8; PACKET_SIZE], bytes: &[u8]) {
println!("===========");
let response = String::from_utf8_lossy(packet);
dbg!(&packet.len());
println!("{}", response);
}
fn main() { fn main() {
let client = Client::new("https://www.google.com/"); let mut ddg = DuckDuckGo::new();
client.request(&"GET", handle_response); let client = Client::new("https://html.duckduckgo.com/html/");
let packets = client.request(&"POST").unwrap();
for ii in (0..packets.len()).step_by(PACKET_SIZE) {
let end_range = min(packets.len(), ii + PACKET_SIZE);
let slice = &packets[ii..end_range];
&ddg.parse_packet(slice.iter());
}
} }