From 999d8b92aad856fcf5d4aa2428624aec3a5c9004 Mon Sep 17 00:00:00 2001 From: Stefan Menner Date: Wed, 28 May 2025 17:16:10 +0200 Subject: [PATCH] Update --- snipped.html | 112 +++++++++++++++++++++++++++++++ src/main.rs | 162 ++++++++++++++++++++++++++++++++++----------- src/types/types.rs | 20 ++++++ 3 files changed, 257 insertions(+), 37 deletions(-) diff --git a/snipped.html b/snipped.html index bd0aa35..91a7c6c 100644 --- a/snipped.html +++ b/snipped.html @@ -74,3 +74,115 @@ x1  →   x1   ( "Schnelle Formation/Verdunstung", 0,08 sek./Stück) + diff --git a/src/main.rs b/src/main.rs index f473b14..bd40d20 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,10 @@ -use std::{fs::File, io::{Read}}; +mod types; + +use std::{fs::File, io::Read, net::Incoming, ops::Deref}; -use scraper::{ElementRef, Node}; use ego_tree::NodeRef; +use scraper::{Element, ElementRef, Node}; +use types::Icon; fn main() -> Result<(), Box> { env_logger::init(); @@ -10,7 +13,6 @@ fn main() -> Result<(), Box> { Ok(()) } - fn _download_file(url: &str, _path: &str) -> Result> { // Some simple CLI args requirements... @@ -22,13 +24,12 @@ fn _download_file(url: &str, _path: &str) -> Result Result { - let mut file = File::open(path)?; + let mut file = File::open(path)?; let mut contents = String::new(); file.read_to_string(&mut contents)?; Ok(contents) @@ -41,8 +42,8 @@ fn parse(html: &str) -> Result> { let selector_verwendung = scraper::Selector::parse("#Verwendung").unwrap(); let selector_li = scraper::Selector::parse("li").unwrap(); - let elt_quelle= document.select(&selector_quelle).next(); - + let elt_quelle = document.select(&selector_quelle).next(); + if elt_quelle.is_none() { eprintln!("No element found with the selector '#Quelle'"); return Ok(false); @@ -55,14 +56,13 @@ fn parse(html: &str) -> Result> { let mut c = 0; let mut elt_ul = None; - while elt_quelle. next_sibling().is_some() { - + while elt_quelle.next_sibling().is_some() { elt_quelle = elt_quelle.next_sibling().unwrap(); - - if !elt_quelle.value().is_element(){ + + if !elt_quelle.value().is_element() { continue; } - + let elt = elt_quelle.value().as_element().unwrap(); if elt.name() == "ul" { c += 1; @@ -84,22 +84,21 @@ fn parse(html: &str) -> Result> { return Ok(false); } let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap(); - let li = elt_ul.select(&selector_li); + let li = elt_ul.select(&selector_li); - for item in li - { + for item in li { parse_li_to_resource(item); } - let elt_verwendung= document.select(&selector_verwendung).next(); - + let elt_verwendung = document.select(&selector_verwendung).next(); + if elt_verwendung.is_none() { eprintln!("No element found with the selector '#Verwendung'"); return Ok(false); } let elt_verwendung = elt_verwendung.unwrap(); - + Ok(true) } @@ -109,29 +108,97 @@ fn parse_li_to_resource(item: ElementRef<'_>) { return; } - let mut resource_items = Vec::new(); + let mut resource_items: Vec> = Vec::new(); + let selector = scraper::Selector::parse("span,a,img,small").unwrap(); + let iter = item.select(&selector); - let first_child = item.first_child().unwrap(); - - resource_items.push(first_child); + for child in iter { + println!( + "name: {}, text: {}, text1: {}", + child.value().name(), + get_text(child.deref()), + get_text1(child.deref()) + ); + } + println!("======================"); +} - let iter = first_child.next_siblings(); +fn get_text(node: &NodeRef<'_, Node>) -> String { + if node.value().is_text() { + return node.value().as_text().unwrap().text.trim().to_string(); + } - for next in iter { - if next.value().is_text() { - resource_items.push(next); + if node.value().is_element() { + let mut text = String::new(); + + for child in node.children() { + text.push_str(&get_text(&child)); } - else if next.value().is_element() { - if next.value().as_element().unwrap().name() == "span" { - parse_resource (resource_items); - resource_items = Vec::new(); - } - resource_items.push(next); + if !text.is_empty() { + return text; } } - println!("======================"); + let next = node.next_sibling(); + if !next.is_some() { + return String::new(); + } + + let next = next.unwrap(); + + if next.value().is_text() { + return next.value().as_text().unwrap().text.trim().to_string(); + } + + String::new() +} + +fn get_text1(node: &NodeRef<'_, Node>) -> String { + if node.value().is_text() { + return node.value().as_text().unwrap().text.trim().to_string(); + } + let next = node.next_sibling(); + if !next.is_some() { + return String::new(); + } + + let next = next.unwrap(); + + if next.value().is_text() { + return next.value().as_text().unwrap().text.trim().to_string(); + } + + String::new() +} + +fn add_all_children<'a>( + child: NodeRef<'a, Node>, + resource_items: &mut Vec>, +) -> bool { + let mut result = false; + + if child.value().is_text() { + resource_items.push(child); + } else if child.value().is_element() { + let selector = scraper::Selector::parse("a,img,small").unwrap(); + if child.value().as_element().unwrap().name() == "span" { + result = true; + } + + if child.value().as_element().unwrap().name() == "a" { + resource_items.push(child); + } + + let items = ElementRef::wrap(child).unwrap().select(&selector); + + for item in items { + let x = item.deref(); + resource_items.push(*x); + } + } + + result } fn parse_resource(resource_items: Vec>) { @@ -140,16 +207,37 @@ fn parse_resource(resource_items: Vec>) { return; } + let mut url: Option<&str> = Option::None; + let mut title: Option<&str> = Option::None; + let mut icon: Option = Option::None; + println!("Parsing resource items..."); for item in resource_items { if item.value().is_text() { - println!("Text: {}", item.value().as_text().unwrap().text.trim_ascii()); + println!( + "Text: {}", + item.value().as_text().unwrap().text.trim_ascii() + ); continue; } - println!("Resource: {:?}", item.value()); + check_item(&item, &mut url, &mut title, &mut icon); + println!("Resource: {:?}", url); } println!("------------------"); - -} \ No newline at end of file +} + +fn check_item( + item: &NodeRef<'_, Node>, + url: &mut Option<&str>, + title: &mut Option<&str>, + icon: &mut Option, +) { + println!( + "Checking item: {} {:?}", + item.value().as_element().unwrap().name(), + item.value().as_element().unwrap().attrs + ); + *url = Some("test"); +} diff --git a/src/types/types.rs b/src/types/types.rs index e69de29..38374cc 100644 --- a/src/types/types.rs +++ b/src/types/types.rs @@ -0,0 +1,20 @@ +#[derive(Debug, PartialEq, Eq)] +pub struct Icon { + pub name: String, + pub url: String, + pub width: u32, + pub height: u32, + pub content_type: String, +} + +impl Clone for Icon { + fn clone(&self) -> Self { + Icon { + name: self.name.clone(), + url: self.url.clone(), + width: self.width, + height: self.height, + content_type: self.content_type.clone(), + } + } +} \ No newline at end of file