mirror of
https://github.com/Myzel394/tifsep.git
synced 2025-06-18 23:45:27 +02:00
feat: Add first wip for parsers
This commit is contained in:
parent
d65f4365f0
commit
d2e9d75b0e
40
Cargo.lock
generated
40
Cargo.lock
generated
@ -17,6 +17,15 @@ version = "1.0.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "autocfg"
|
name = "autocfg"
|
||||||
version = "1.1.0"
|
version = "1.1.0"
|
||||||
@ -537,6 +546,35 @@ dependencies = [
|
|||||||
"bitflags 1.3.2",
|
"bitflags 1.3.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.10.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.4.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.8.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "reqwest"
|
name = "reqwest"
|
||||||
version = "0.11.23"
|
version = "0.11.23"
|
||||||
@ -787,6 +825,8 @@ dependencies = [
|
|||||||
name = "tcp_test"
|
name = "tcp_test"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"lazy_static",
|
||||||
|
"regex",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"rustls",
|
"rustls",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
@ -6,6 +6,8 @@ edition = "2021"
|
|||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
lazy_static = "1.4.0"
|
||||||
|
regex = "1.10.3"
|
||||||
reqwest = "0.11.23"
|
reqwest = "0.11.23"
|
||||||
rustls = "0.22.2"
|
rustls = "0.22.2"
|
||||||
tokio = "1.35.1"
|
tokio = "1.35.1"
|
||||||
|
@ -58,7 +58,16 @@ pub mod client {
|
|||||||
"{} {} HTTP/1.1\r\n",
|
"{} {} HTTP/1.1\r\n",
|
||||||
"Host: {}\r\n",
|
"Host: {}\r\n",
|
||||||
"Connection: close\r\n",
|
"Connection: close\r\n",
|
||||||
"Accept-Encoding: identity\r\n",
|
// "Accept-Encoding: identity\r\n",
|
||||||
|
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\r\n",
|
||||||
|
"Upgrade-Insecure-Requests: 1\r\n",
|
||||||
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8\r\n",
|
||||||
|
"Dnt: 1\r\n",
|
||||||
|
"Accept-Language: en-US,en;q=0.9\r\n",
|
||||||
|
"Content-Type: application/x-www-form-urlencoded\r\n",
|
||||||
|
"Content-Length: 6\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"q=duck\r\n",
|
||||||
"\r\n",
|
"\r\n",
|
||||||
),
|
),
|
||||||
method,
|
method,
|
||||||
@ -70,13 +79,14 @@ pub mod client {
|
|||||||
pub fn request(
|
pub fn request(
|
||||||
&self,
|
&self,
|
||||||
method: &str,
|
method: &str,
|
||||||
on_partial: fn(&[u8; PACKET_SIZE], &[u8]),
|
// on_partial: Box<dyn FnMut(&[u8; PACKET_SIZE], &[u8])>,
|
||||||
) -> Result<Vec<u8>, Box<dyn Error>> {
|
) -> Result<Vec<u8>, Box<dyn Error>> {
|
||||||
let mut connection = self.create_connection()?;
|
let mut connection = self.create_connection()?;
|
||||||
|
|
||||||
let mut sock = self.create_tcp_stream()?;
|
let mut sock = self.create_tcp_stream()?;
|
||||||
let mut tls = rustls::Stream::new(&mut connection, &mut sock);
|
let mut tls = rustls::Stream::new(&mut connection, &mut sock);
|
||||||
let http_header = self.create_http_header(method)?;
|
let http_header = self.create_http_header(method)?;
|
||||||
|
dbg!(&http_header);
|
||||||
tls.write_all(&http_header.as_bytes())?;
|
tls.write_all(&http_header.as_bytes())?;
|
||||||
|
|
||||||
// Read packets one by one
|
// Read packets one by one
|
||||||
@ -86,7 +96,7 @@ pub mod client {
|
|||||||
|
|
||||||
let n = tls.read(&mut buf)?;
|
let n = tls.read(&mut buf)?;
|
||||||
|
|
||||||
on_partial(&buf, &data);
|
// (on_partial)(&buf, &data);
|
||||||
|
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
break;
|
break;
|
||||||
|
2
src/engines.rs
Normal file
2
src/engines.rs
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
pub mod duckduckgo;
|
||||||
|
pub mod engine_base;
|
90
src/engines/duckduckgo.rs
Normal file
90
src/engines/duckduckgo.rs
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
// Search engine parser for DuckDuckGo
|
||||||
|
pub mod duckduckgo {
|
||||||
|
// Results start at:
|
||||||
|
// <div id="links" class="results">
|
||||||
|
// Example for a result:
|
||||||
|
// <div class="result results_links results_links_deep web-result ">
|
||||||
|
// <div class="links_main links_deep result__body">
|
||||||
|
// <h2 class="result__title">
|
||||||
|
// <a
|
||||||
|
// rel="nofollow" class="result__a"
|
||||||
|
// href="https://www.speedtest.net/">
|
||||||
|
// Speedtest by Ookla - The Global Broadband Speed Test
|
||||||
|
// </a>
|
||||||
|
// </h2>
|
||||||
|
// <div class="result__extras">
|
||||||
|
// <div class="result__extras__url">
|
||||||
|
// <span class="result__icon">
|
||||||
|
// <a rel="nofollow" href="https://www.speedtest.net/">
|
||||||
|
// <img class="result__icon__img" width="16" height="16" alt=""
|
||||||
|
// src="//external-content.duckduckgo.com/ip3/www.speedtest.net.ico" name="i15" />
|
||||||
|
// </a>
|
||||||
|
// </span>
|
||||||
|
// <a class="result__url" href="https://www.speedtest.net/">
|
||||||
|
// www.speedtest.net
|
||||||
|
// </a>
|
||||||
|
// </div>
|
||||||
|
// </div>
|
||||||
|
// <a
|
||||||
|
// class="result__snippet"
|
||||||
|
// href="https://www.speedtest.net/">
|
||||||
|
// Use Speedtest on all your devices with our free desktop and mobile apps.
|
||||||
|
// </a>
|
||||||
|
// <div class="clear"></div>
|
||||||
|
// </div>
|
||||||
|
// </div>
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
use crate::engines::engine_base::engine_base::{EngineBase, SearchResult};
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
static ref RESULTS_START: Regex = Regex::new(r#"id=\"links\""#).unwrap();
|
||||||
|
static ref SINGLE_RESULT: Regex = Regex::new(r#"<div class="result.*?<a.*?href="(?<url>.*?)".*?>(?<title>.*?)<\/a>.*?class="result__snippet".*?>(?<description>.*?)<\/a>.*?class="clear".*?<\/div>( <\/div>){2}"#).unwrap();
|
||||||
|
static ref STRIP: Regex = Regex::new(r"\s+").unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DuckDuckGo {
|
||||||
|
pub search_results: Vec<SearchResult>,
|
||||||
|
results_started: bool,
|
||||||
|
previous_block: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl EngineBase for DuckDuckGo {
|
||||||
|
fn get_search_results(&self) -> &Vec<SearchResult> {
|
||||||
|
&self.search_results
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>) {
|
||||||
|
let bytes: Vec<u8> = packet.map(|bit| *bit).collect();
|
||||||
|
let raw_text = String::from_utf8_lossy(&bytes);
|
||||||
|
let text = STRIP.replace_all(&raw_text, " ");
|
||||||
|
|
||||||
|
if self.results_started {
|
||||||
|
self.previous_block.push_str(&text);
|
||||||
|
|
||||||
|
match SINGLE_RESULT.captures(&self.previous_block.to_owned()) {
|
||||||
|
Some(captures) => {
|
||||||
|
self.previous_block.clear();
|
||||||
|
println!("{}", &captures.name("title").unwrap().as_str());
|
||||||
|
println!("{}", &captures.name("description").unwrap().as_str());
|
||||||
|
println!("{}", &captures.name("url").unwrap().as_str());
|
||||||
|
}
|
||||||
|
None => {}
|
||||||
|
}
|
||||||
|
} else if RESULTS_START.is_match(&text) {
|
||||||
|
self.results_started = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DuckDuckGo {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
search_results: Vec::new(),
|
||||||
|
results_started: false,
|
||||||
|
previous_block: String::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
13
src/engines/engine_base.rs
Normal file
13
src/engines/engine_base.rs
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
pub mod engine_base {
    /// One entry of a search engine's result list.
    ///
    /// All fields are plain owned strings; no validation is performed here.
    #[derive(Debug, Clone, PartialEq, Eq, Default)]
    pub struct SearchResult {
        /// Title of the result.
        pub title: String,
        /// Target URL of the result.
        pub url: String,
        /// Description (snippet) shown for the result.
        pub description: String,
    }

    /// Common interface of the incremental search-engine parsers.
    pub trait EngineBase {
        /// Returns the results the engine has collected so far.
        fn get_search_results(&self) -> &Vec<SearchResult>;

        /// Feeds the next chunk of raw response bytes to the parser.
        ///
        /// Implementations may keep state between calls, which is why this
        /// takes `&mut self`.
        fn parse_packet<'a>(&mut self, packet: impl Iterator<Item = &'a u8>);
    }
}
|
24
src/main.rs
24
src/main.rs
@ -1,15 +1,21 @@
|
|||||||
|
use std::{cmp::min, io::Read};
|
||||||
|
|
||||||
use client::client::{Client, PACKET_SIZE};
|
use client::client::{Client, PACKET_SIZE};
|
||||||
|
use engines::{duckduckgo::duckduckgo::DuckDuckGo, engine_base::engine_base::EngineBase};
|
||||||
|
|
||||||
pub mod client;
|
pub mod client;
|
||||||
|
pub mod engines;
|
||||||
fn handle_response(packet: &[u8; PACKET_SIZE], bytes: &[u8]) {
|
|
||||||
println!("===========");
|
|
||||||
let response = String::from_utf8_lossy(packet);
|
|
||||||
dbg!(&packet.len());
|
|
||||||
println!("{}", response);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let client = Client::new("https://www.google.com/");
|
let mut ddg = DuckDuckGo::new();
|
||||||
client.request(&"GET", handle_response);
|
let client = Client::new("https://html.duckduckgo.com/html/");
|
||||||
|
|
||||||
|
let packets = client.request(&"POST").unwrap();
|
||||||
|
|
||||||
|
for ii in (0..packets.len()).step_by(PACKET_SIZE) {
|
||||||
|
let end_range = min(packets.len(), ii + PACKET_SIZE);
|
||||||
|
|
||||||
|
let slice = &packets[ii..end_range];
|
||||||
|
&ddg.parse_packet(slice.iter());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user