This commit is contained in:
2025-05-29 11:43:28 +02:00
parent 999d8b92aa
commit b08cb54acd
4 changed files with 3450 additions and 2156 deletions

View File

@@ -12,5 +12,6 @@ ego-tree = "0.10.0"
env_logger = "0.11.8" env_logger = "0.11.8"
log = "0.4.27" log = "0.4.27"
merlin_env_helper = { version = "0.2.0", registry = "merlin" } merlin_env_helper = { version = "0.2.0", registry = "merlin" }
regex = "1.11.1"
reqwest = {version="0.12.15", features=["blocking"]} reqwest = {version="0.12.15", features=["blocking"]}
scraper = "0.23.1" scraper = "0.23.1"

View File

@@ -1,10 +1,11 @@
mod types; mod types;
use std::{fs::File, io::Read, net::Incoming, ops::Deref}; use std::{fs::File, io::Read, net::Incoming, ops::Deref, time::Duration};
use ego_tree::NodeRef; use ego_tree::NodeRef;
use regex::Regex;
use scraper::{Element, ElementRef, Node}; use scraper::{Element, ElementRef, Node};
use types::Icon; use types::{Icon, ParseType};
fn main() -> Result<(), Box<dyn std::error::Error>> { fn main() -> Result<(), Box<dyn std::error::Error>> {
env_logger::init(); env_logger::init();
@@ -98,7 +99,39 @@ fn parse(html: &str) -> Result<bool, Box<dyn std::error::Error>> {
} }
let elt_verwendung = elt_verwendung.unwrap(); let elt_verwendung = elt_verwendung.unwrap();
let mut elt_verwendung = elt_verwendung.parent().unwrap();
let mut elt_ul = None;
while elt_verwendung.next_sibling().is_some() {
elt_verwendung = elt_verwendung.next_sibling().unwrap();
if !elt_verwendung.value().is_element() {
continue;
}
let elt = elt_verwendung.value().as_element().unwrap();
if elt.name() == "ul" {
elt_ul = Some(elt_verwendung);
break;
}
if elt.name() == "h2" {
eprintln!("Found 'h2' element, stopping search for 'ul'");
break;
}
}
if elt_ul.is_none() {
eprintln!("No second 'ul' element found after '#Verwendung'");
return Ok(false);
}
let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap();
let li = elt_ul.select(&selector_li);
for item in li {
parse_li_to_resource(item);
}
Ok(true) Ok(true)
} }
@@ -108,21 +141,137 @@ fn parse_li_to_resource(item: ElementRef<'_>) {
return; return;
} }
let mut resource_items: Vec<NodeRef<'_, Node>> = Vec::new(); let mut resource_items: Vec<ParseType> = Vec::new();
let selector = scraper::Selector::parse("span,a,img,small").unwrap(); let selector = scraper::Selector::parse("span,a,img,small").unwrap();
let iter = item.select(&selector); let iter = item.select(&selector);
for child in iter { for child in iter {
println!( let elem = child.value();
"name: {}, text: {}, text1: {}", let name = elem.name();
child.value().name(),
get_text(child.deref()), if name == "a"
get_text1(child.deref()) && elem.attrs().find(|attr| attr.0 == "href").is_some()
); && elem.attrs().find(|attr| attr.0 == "title").is_some()
{
resource_items.push(ParseType::Link {
url: elem
.attrs()
.find(|attr| attr.0 == "href")
.unwrap()
.1
.to_string(),
title: elem
.attrs()
.find(|attr| attr.0 == "title")
.unwrap()
.1
.to_string(),
});
let txt = get_text_next(&child);
if !txt.is_empty() {
parse_text(&txt, &mut resource_items);
}
} else if name == "img"
&& elem.attrs().find(|attr| attr.0 == "data-src").is_some()
&& elem.attrs().find(|attr| attr.0 == "width").is_some()
&& elem.attrs().find(|attr| attr.0 == "height").is_some()
&& elem
.attrs()
.find(|attr| attr.0 == "data-image-name")
.is_some()
{
let url = elem.attrs().find(|attr| attr.0 == "data-src").unwrap().1;
let name = elem
.attrs()
.find(|attr| attr.0 == "data-image-name")
.unwrap()
.1;
let width = elem
.attrs()
.find(|attr| attr.0 == "width")
.unwrap()
.1
.parse()
.unwrap_or(0);
let height = elem
.attrs()
.find(|attr| attr.0 == "height")
.unwrap()
.1
.parse()
.unwrap_or(0);
resource_items.push(ParseType::Img(Icon {
name: name.to_string(),
url: url.to_string(),
width,
height,
content_type: "image/png".to_string(), // Assuming PNG, adjust as needed
}));
} else if name == "small" {
let txt = get_text(&child);
parse_text(&txt, &mut resource_items);
}
}
for item in resource_items.iter() {
match item {
ParseType::Link { url, title } => {
println!("Link: {} - {}", url, title);
}
ParseType::Img(icon) => {
println!(
"Image: {} ({}x{}) - {}",
icon.name, icon.width, icon.height, icon.url
);
}
ParseType::Count(count) => {
println!("Count: {}", count);
}
ParseType::ResourceAdd => {
println!("ResourceAdd");
}
ParseType::ResourceLast => {
println!("ResourceLast");
}
ParseType::Duration { duration, unit } => {
println!("Duration: {} {}", duration, unit);
}
}
} }
println!("======================"); println!("======================");
} }
fn parse_text(text: &str, resource_items: &mut Vec<ParseType>) {
let reg1 = Regex::new(r"^\s*x(?<count>\d+)\s+(?<end>[→+])\s*$").unwrap();
let reg2 = Regex::new(r"^\s*\(.*(?<duration>\d+,\d+)\ssek\./(?<unit>\w+)\s*\)$").unwrap();
if let Some(res) = reg1.captures(text) {
let count = res.name("count").unwrap().as_str().parse().unwrap_or(0);
resource_items.push(ParseType::Count(count));
let end = res.name("end").unwrap().as_str().to_string();
if end == "+" {
resource_items.push(ParseType::ResourceAdd);
} else {
resource_items.push(ParseType::ResourceLast);
}
}
if let Some(res) = reg2.captures(text) {
let duration_str = res.name("duration").unwrap().as_str();
let duration: f64 = duration_str.replace(',', ".").parse().unwrap_or(0.0);
let unit = res.name("unit").unwrap().as_str().to_string();
let duration: u64 = (duration * 1000.0) as u64; // Convert to milliseconds
resource_items.push(ParseType::Duration { duration, unit });
}
}
fn get_text(node: &NodeRef<'_, Node>) -> String { fn get_text(node: &NodeRef<'_, Node>) -> String {
if node.value().is_text() { if node.value().is_text() {
return node.value().as_text().unwrap().text.trim().to_string(); return node.value().as_text().unwrap().text.trim().to_string();
@@ -154,7 +303,7 @@ fn get_text(node: &NodeRef<'_, Node>) -> String {
String::new() String::new()
} }
fn get_text1(node: &NodeRef<'_, Node>) -> String { fn get_text_next(node: &NodeRef<'_, Node>) -> String {
if node.value().is_text() { if node.value().is_text() {
return node.value().as_text().unwrap().text.trim().to_string(); return node.value().as_text().unwrap().text.trim().to_string();
} }

View File

@@ -18,3 +18,12 @@ impl Clone for Icon {
} }
} }
} }
/// One token recognised while parsing a resource list entry.
#[derive(Debug, PartialEq, Eq)]
pub enum ParseType {
    /// An `<a>` element that carries both an `href` and a `title` attribute.
    Link {
        /// Value of the `href` attribute.
        url: String,
        /// Value of the `title` attribute.
        title: String,
    },
    /// An `<img>` element, described by an [`Icon`].
    Img(Icon),
    /// A quantity parsed from text of the form `x<digits>`.
    Count(u32),
    /// Emitted after a [`ParseType::Count`] whose text ended in `+`.
    ResourceAdd,
    /// Emitted after a [`ParseType::Count`] whose text ended in `→`.
    ResourceLast,
    /// A duration parsed from text of the form `(… <n>,<m> sek./<unit>)`.
    Duration {
        /// Parsed value in milliseconds.
        duration: u64,
        /// The word that follows `sek./` in the parsed text.
        unit: String,
    },
}

5425
test.html

File diff suppressed because one or more lines are too long