diff --git a/Cargo.toml b/Cargo.toml index f717115..55883db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,5 +12,6 @@ ego-tree = "0.10.0" env_logger = "0.11.8" log = "0.4.27" merlin_env_helper = { version = "0.2.0", registry = "merlin" } +regex = "1.11.1" reqwest = {version="0.12.15", features=["blocking"]} scraper = "0.23.1" diff --git a/src/main.rs b/src/main.rs index bd40d20..48efd9c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,11 @@ mod types; -use std::{fs::File, io::Read, net::Incoming, ops::Deref}; +use std::{fs::File, io::Read, net::Incoming, ops::Deref, time::Duration}; use ego_tree::NodeRef; +use regex::Regex; use scraper::{Element, ElementRef, Node}; -use types::Icon; +use types::{Icon, ParseType}; fn main() -> Result<(), Box> { env_logger::init(); @@ -98,7 +99,39 @@ fn parse(html: &str) -> Result> { } let elt_verwendung = elt_verwendung.unwrap(); + let mut elt_verwendung = elt_verwendung.parent().unwrap(); + let mut elt_ul = None; + + while elt_verwendung.next_sibling().is_some() { + elt_verwendung = elt_verwendung.next_sibling().unwrap(); + + if !elt_verwendung.value().is_element() { + continue; + } + + let elt = elt_verwendung.value().as_element().unwrap(); + if elt.name() == "ul" { + elt_ul = Some(elt_verwendung); + break; + } + + if elt.name() == "h2" { + eprintln!("Found 'h2' element, stopping search for 'ul'"); + break; + } + } + + if elt_ul.is_none() { + eprintln!("No second 'ul' element found after '#Verwendung'"); + return Ok(false); + } + let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap(); + let li = elt_ul.select(&selector_li); + + for item in li { + parse_li_to_resource(item); + } Ok(true) } @@ -108,21 +141,137 @@ fn parse_li_to_resource(item: ElementRef<'_>) { return; } - let mut resource_items: Vec> = Vec::new(); + let mut resource_items: Vec = Vec::new(); + let selector = scraper::Selector::parse("span,a,img,small").unwrap(); let iter = item.select(&selector); for child in iter { - println!( - "name: {}, text: {}, text1: {}", - child.value().name(), - get_text(child.deref()), - get_text1(child.deref()) - ); + let elem = child.value(); + let name = elem.name(); + + if name == "a" + && elem.attrs().find(|attr| attr.0 == "href").is_some() + && elem.attrs().find(|attr| attr.0 == "title").is_some() + { + resource_items.push(ParseType::Link { + url: elem + .attrs() + .find(|attr| attr.0 == "href") + .unwrap() + .1 + .to_string(), + title: elem + .attrs() + .find(|attr| attr.0 == "title") + .unwrap() + .1 + .to_string(), + }); + + let txt = get_text_next(&child); + + if !txt.is_empty() { + parse_text(&txt, &mut resource_items); + } + } else if name == "img" + && elem.attrs().find(|attr| attr.0 == "data-src").is_some() + && elem.attrs().find(|attr| attr.0 == "width").is_some() + && elem.attrs().find(|attr| attr.0 == "height").is_some() + && elem + .attrs() + .find(|attr| attr.0 == "data-image-name") + .is_some() + { + let url = elem.attrs().find(|attr| attr.0 == "data-src").unwrap().1; + let name = elem + .attrs() + .find(|attr| attr.0 == "data-image-name") + .unwrap() + .1; + let width = elem + .attrs() + .find(|attr| attr.0 == "width") + .unwrap() + .1 + .parse() + .unwrap_or(0); + let height = elem + .attrs() + .find(|attr| attr.0 == "height") + .unwrap() + .1 + .parse() + .unwrap_or(0); + + resource_items.push(ParseType::Img(Icon { + name: name.to_string(), + url: url.to_string(), + width, + height, + content_type: "image/png".to_string(), // Assuming PNG, adjust as needed + })); + } else if name == "small" { + let txt = get_text(&child); + parse_text(&txt, &mut resource_items); + } + } + for item in resource_items.iter() { + match item { + ParseType::Link { url, title } => { + println!("Link: {} - {}", url, title); + } + ParseType::Img(icon) => { + println!( + "Image: {} ({}x{}) - {}", + icon.name, icon.width, icon.height, icon.url + ); + } + ParseType::Count(count) => { + println!("Count: {}", count); + } + ParseType::ResourceAdd => { + println!("ResourceAdd"); + } + ParseType::ResourceLast => { + println!("ResourceLast"); + } + ParseType::Duration { duration, unit } => { + println!("Duration: {} {}", duration, unit); + } + } } println!("======================"); } +fn parse_text(text: &str, resource_items: &mut Vec) { + let reg1 = Regex::new(r"^\s*x(?\d+)\s+(?[→+])\s*$").unwrap(); + let reg2 = Regex::new(r"^\s*\(.*(?\d+,\d+)\ssek\./(?\w+)\s*\)$").unwrap(); + + if let Some(res) = reg1.captures(text) { + let count = res.name("count").unwrap().as_str().parse().unwrap_or(0); + + resource_items.push(ParseType::Count(count)); + + let end = res.name("end").unwrap().as_str().to_string(); + + if end == "+" { + resource_items.push(ParseType::ResourceAdd); + } else { + resource_items.push(ParseType::ResourceLast); + } + } + + if let Some(res) = reg2.captures(text) { + let duration_str = res.name("duration").unwrap().as_str(); + let duration: f64 = duration_str.replace(',', ".").parse().unwrap_or(0.0); + let unit = res.name("unit").unwrap().as_str().to_string(); + let duration: u64 = (duration * 1000.0) as u64; // Convert to milliseconds + + resource_items.push(ParseType::Duration { duration, unit }); + } +} + fn get_text(node: &NodeRef<'_, Node>) -> String { if node.value().is_text() { return node.value().as_text().unwrap().text.trim().to_string(); @@ -154,7 +303,7 @@ fn get_text(node: &NodeRef<'_, Node>) -> String { String::new() } -fn get_text1(node: &NodeRef<'_, Node>) -> String { +fn get_text_next(node: &NodeRef<'_, Node>) -> String { if node.value().is_text() { return node.value().as_text().unwrap().text.trim().to_string(); } diff --git a/src/types/types.rs b/src/types/types.rs index 38374cc..a171394 100644 --- a/src/types/types.rs +++ b/src/types/types.rs @@ -17,4 +17,13 @@ impl Clone for Icon { content_type: self.content_type.clone(), } } -} \ No newline at end of file +} +#[derive(Debug, PartialEq, Eq)] +pub enum ParseType { + Link { url: String, title: String }, + Img(Icon), + Count(u32), + ResourceAdd, + ResourceLast, + Duration { duration: u64, unit: String }, +} diff --git a/test.html b/test.html index 019348a..4277517 100644 --- a/test.html +++ b/test.html @@ -1,1835 +1,2702 @@ - - -Salz – No Man's Sky Wiki - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + Salz – No Man's Sky Wiki + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - + } + /* eslint-enable */ + + + + + + + + + + - if ( contentWidthPreference === 'expanded' ) { - document.documentElement.classList.add('is-content-expanded'); - } - - - - - - -
- - - - -
-
- - -
- - - No Man's Sky Wiki - - -
-
- - -
- -
-
-
- -
-
-
-
- -
-
-
Advertisement
-
-
-
- - -
-
-
-
- - - -
-
- -
- - - +
- -
-
-
Advertisement
-
+
+ + + +
+
+
+
Advertisement
+
+ + + + + + + + + + - - - - - - - \ No newline at end of file