mirror of
https://github.com/Myzel394/tifsep.git
synced 2025-06-18 07:25:26 +02:00
feat: Add first wip for parsers
This commit is contained in:
parent
d65f4365f0
commit
d2e9d75b0e
40
Cargo.lock
generated
40
Cargo.lock
generated
@ -17,6 +17,15 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
@ -537,6 +546,35 @@ dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.10.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.11.23"
|
||||
@ -787,6 +825,8 @@ dependencies = [
|
||||
name = "tcp_test"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"rustls",
|
||||
"tokio",
|
||||
|
@ -6,6 +6,8 @@ edition = "2021"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
lazy_static = "1.4.0"
|
||||
regex = "1.10.3"
|
||||
reqwest = "0.11.23"
|
||||
rustls = "0.22.2"
|
||||
tokio = "1.35.1"
|
||||
|
@ -58,7 +58,16 @@ pub mod client {
|
||||
"{} {} HTTP/1.1\r\n",
|
||||
"Host: {}\r\n",
|
||||
"Connection: close\r\n",
|
||||
"Accept-Encoding: identity\r\n",
|
||||
// "Accept-Encoding: identity\r\n",
|
||||
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\r\n",
|
||||
"Upgrade-Insecure-Requests: 1\r\n",
|
||||
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8\r\n",
|
||||
"Dnt: 1\r\n",
|
||||
"Accept-Language: en-US,en;q=0.9\r\n",
|
||||
"Content-Type: application/x-www-form-urlencoded\r\n",
|
||||
"Content-Length: 6\r\n",
|
||||
"\r\n",
|
||||
"q=duck\r\n",
|
||||
"\r\n",
|
||||
),
|
||||
method,
|
||||
@ -70,13 +79,14 @@ pub mod client {
|
||||
pub fn request(
|
||||
&self,
|
||||
method: &str,
|
||||
on_partial: fn(&[u8; PACKET_SIZE], &[u8]),
|
||||
// on_partial: Box<dyn FnMut(&[u8; PACKET_SIZE], &[u8])>,
|
||||
) -> Result<Vec<u8>, Box<dyn Error>> {
|
||||
let mut connection = self.create_connection()?;
|
||||
|
||||
let mut sock = self.create_tcp_stream()?;
|
||||
let mut tls = rustls::Stream::new(&mut connection, &mut sock);
|
||||
let http_header = self.create_http_header(method)?;
|
||||
dbg!(&http_header);
|
||||
tls.write_all(&http_header.as_bytes())?;
|
||||
|
||||
// Read packages one by one
|
||||
@ -86,7 +96,7 @@ pub mod client {
|
||||
|
||||
let n = tls.read(&mut buf)?;
|
||||
|
||||
on_partial(&buf, &data);
|
||||
// (on_partial)(&buf, &data);
|
||||
|
||||
if n == 0 {
|
||||
break;
|
||||
|
2
src/engines.rs
Normal file
2
src/engines.rs
Normal file
@ -0,0 +1,2 @@
|
||||
pub mod duckduckgo;
|
||||
pub mod engine_base;
|
90
src/engines/duckduckgo.rs
Normal file
90
src/engines/duckduckgo.rs
Normal file
@ -0,0 +1,90 @@
|
||||
// Search engine parser for DuckDuckGo
|
||||
pub mod duckduckgo {
|
||||
// Results start at:
|
||||
// <div id="links" class="results">
|
||||
// Example for a result:
|
||||
// <div class="result results_links results_links_deep web-result ">
|
||||
// <div class="links_main links_deep result__body">
|
||||
// <h2 class="result__title">
|
||||
// <a
|
||||
// rel="nofollow" class="result__a"
|
||||
// href="https://www.speedtest.net/">
|
||||
// Speedtest by Ookla - The Global Broadband Speed Test
|
||||
// </a>
|
||||
// </h2>
|
||||
// <div class="result__extras">
|
||||
// <div class="result__extras__url">
|
||||
// <span class="result__icon">
|
||||
// <a rel="nofollow" href="https://www.speedtest.net/">
|
||||
// <img class="result__icon__img" width="16" height="16" alt=""
|
||||
// src="//external-content.duckduckgo.com/ip3/www.speedtest.net.ico" name="i15" />
|
||||
// </a>
|
||||
// </span>
|
||||
// <a class="result__url" href="https://www.speedtest.net/">
|
||||
// www.speedtest.net
|
||||
// </a>
|
||||
// </div>
|
||||
// </div>
|
||||
// <a
|
||||
// class="result__snippet"
|
||||
// href="https://www.speedtest.net/">
|
||||
// Use Speedtest on all your devices with our free desktop and mobile apps.
|
||||
// </a>
|
||||
// <div class="clear"></div>
|
||||
// </div>
|
||||
// </div>
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::engines::engine_base::engine_base::{EngineBase, SearchResult};
|
||||
|
||||
lazy_static! {
    // Matches the `id="links"` attribute; `parse_packet` uses it to detect
    // that the result list has started.
    static ref RESULTS_START: Regex = Regex::new(r#"id=\"links\""#)
        .expect("RESULTS_START is a hard-coded, valid pattern");

    // Matches one complete result block and captures `url`, `title` and
    // `description`. The pattern expects whitespace-collapsed input: it looks
    // for single literal spaces between the trailing `</div>` tags, and `.`
    // does not match newlines by default.
    static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="result.*?<a.*?href="(?<url>.*?)".*?>(?<title>.*?)<\/a>.*?class="result__snippet".*?>(?<description>.*?)<\/a>.*?class="clear".*?<\/div>( <\/div>){2}"#)
        .expect("SINGLE_RESULT is a hard-coded, valid pattern");

    // Matches any run of whitespace so it can be replaced by a single space.
    static ref STRIP: Regex = Regex::new(r"\s+")
        .expect("STRIP is a hard-coded, valid pattern");
}
|
||||
|
||||
pub struct DuckDuckGo {
|
||||
pub search_results: Vec<SearchResult>,
|
||||
results_started: bool,
|
||||
previous_block: String,
|
||||
}
|
||||
|
||||
impl EngineBase for DuckDuckGo {
|
||||
fn get_search_results(&self) -> &Vec<SearchResult> {
|
||||
&self.search_results
|
||||
}
|
||||
|
||||
fn parse_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
|
||||
let bytes: Vec<u8> = packet.map(|bit| *bit).collect();
|
||||
let raw_text = String::from_utf8_lossy(&bytes);
|
||||
let text = STRIP.replace_all(&raw_text, " ");
|
||||
|
||||
if self.results_started {
|
||||
self.previous_block.push_str(&text);
|
||||
|
||||
match SINGLE_RESULT.captures(&self.previous_block.to_owned()) {
|
||||
Some(captures) => {
|
||||
self.previous_block.clear();
|
||||
println!("{}", &captures.name("title").unwrap().as_str());
|
||||
println!("{}", &captures.name("description").unwrap().as_str());
|
||||
println!("{}", &captures.name("url").unwrap().as_str());
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
} else if RESULTS_START.is_match(&text) {
|
||||
self.results_started = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DuckDuckGo {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
search_results: Vec::new(),
|
||||
results_started: false,
|
||||
previous_block: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
13
src/engines/engine_base.rs
Normal file
13
src/engines/engine_base.rs
Normal file
@ -0,0 +1,13 @@
|
||||
pub mod engine_base {
    /// A single entry of a search engine's result list.
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub struct SearchResult {
        /// Title of the result.
        pub title: String,
        /// Target URL of the result.
        pub url: String,
        /// Description text of the result.
        pub description: String,
    }

    /// Interface implemented by every search engine parser.
    pub trait EngineBase {
        /// Returns the results the parser holds.
        fn get_search_results(&self) -> &Vec<SearchResult>;

        /// Feeds the bytes of one packet into the parser.
        fn parse_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>);
    }
}
|
24
src/main.rs
24
src/main.rs
@ -1,15 +1,21 @@
|
||||
use std::{cmp::min, io::Read};
|
||||
|
||||
use client::client::{Client, PACKET_SIZE};
|
||||
use engines::{duckduckgo::duckduckgo::DuckDuckGo, engine_base::engine_base::EngineBase};
|
||||
|
||||
pub mod client;
|
||||
|
||||
fn handle_response(packet: &[u8; PACKET_SIZE], bytes: &[u8]) {
|
||||
println!("===========");
|
||||
let response = String::from_utf8_lossy(packet);
|
||||
dbg!(&packet.len());
|
||||
println!("{}", response);
|
||||
}
|
||||
pub mod engines;
|
||||
|
||||
fn main() {
|
||||
let client = Client::new("https://www.google.com/");
|
||||
client.request(&"GET", handle_response);
|
||||
let mut ddg = DuckDuckGo::new();
|
||||
let client = Client::new("https://html.duckduckgo.com/html/");
|
||||
|
||||
let packets = client.request(&"POST").unwrap();
|
||||
|
||||
for ii in (0..packets.len()).step_by(PACKET_SIZE) {
|
||||
let end_range = min(packets.len(), ii + PACKET_SIZE);
|
||||
|
||||
let slice = &packets[ii..end_range];
|
||||
&ddg.parse_packet(slice.iter());
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user