Parser update

This commit is contained in:
2025-06-01 21:32:06 +02:00
parent 4f55658b87
commit d2589affe5
7 changed files with 8218 additions and 370 deletions

View File

@@ -1,16 +1,14 @@
mod parse;
mod types;
use std::{fs::File, io::Read, net::Incoming, ops::Deref, time::Duration};
use std::{fs::File, io::Read};
use ego_tree::NodeRef;
use regex::Regex;
use scraper::{Element, ElementRef, Node};
use types::{Icon, ParseType};
use parse::parse;
/// Program entry point: initialises the logger, loads HTML fixtures from disk
/// and feeds them to `parse`.
///
/// Returns an error when reading a fixture fails, or when the first `parse`
/// call reports one.
// NOTE(review): this listing appears to be a diff rendered without +/- markers,
// so the `test.html` pair and the `test_vc.html` pair below are probably the old
// and new versions of the same two statements rather than four statements meant
// to run together — confirm against the actual file.
fn main() -> Result<(), Box<dyn std::error::Error>> {
env_logger::init();
let html = read("test.html")?;
parse(&html)?;
let html = read("test_vc.html")?;
// The return value of this call is not inspected.
parse(&html);
Ok(())
}
@@ -35,358 +33,3 @@ fn read(path: &str) -> Result<String, std::io::Error> {
file.read_to_string(&mut contents)?;
Ok(contents)
}
fn parse(html: &str) -> Result<bool, Box<dyn std::error::Error>> {
// Parse the HTML content
let document = scraper::Html::parse_document(html);
let selector_quelle = scraper::Selector::parse("#Quelle").unwrap();
let selector_verwendung = scraper::Selector::parse("#Verwendung").unwrap();
let selector_li = scraper::Selector::parse("li").unwrap();
let elt_quelle = document.select(&selector_quelle).next();
if elt_quelle.is_none() {
eprintln!("No element found with the selector '#Quelle'");
return Ok(false);
}
let elt_quelle = elt_quelle.unwrap();
let mut elt_quelle = elt_quelle.parent().unwrap();
let mut c = 0;
let mut elt_ul = None;
while elt_quelle.next_sibling().is_some() {
elt_quelle = elt_quelle.next_sibling().unwrap();
if !elt_quelle.value().is_element() {
continue;
}
let elt = elt_quelle.value().as_element().unwrap();
if elt.name() == "ul" {
c += 1;
if c > 1 {
elt_ul = Some(elt_quelle);
break;
}
}
if elt.name() == "h2" {
eprintln!("Found 'h2' element, stopping search for 'ul'");
break;
}
}
if elt_ul.is_none() {
eprintln!("No second 'ul' element found after '#Quelle'");
return Ok(false);
}
let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap();
let li = elt_ul.select(&selector_li);
for item in li {
parse_li_to_resource(item);
}
let elt_verwendung = document.select(&selector_verwendung).next();
if elt_verwendung.is_none() {
eprintln!("No element found with the selector '#Verwendung'");
return Ok(false);
}
let elt_verwendung = elt_verwendung.unwrap();
let mut elt_verwendung = elt_verwendung.parent().unwrap();
let mut elt_ul = None;
while elt_verwendung.next_sibling().is_some() {
elt_verwendung = elt_verwendung.next_sibling().unwrap();
if !elt_verwendung.value().is_element() {
continue;
}
let elt = elt_verwendung.value().as_element().unwrap();
if elt.name() == "ul" {
elt_ul = Some(elt_verwendung);
break;
}
if elt.name() == "h2" {
eprintln!("Found 'h2' element, stopping search for 'ul'");
break;
}
}
if elt_ul.is_none() {
eprintln!("No second 'ul' element found after '#Verwendung'");
return Ok(false);
}
let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap();
let li = elt_ul.select(&selector_li);
for item in li {
parse_li_to_resource(item);
}
Ok(true)
}
/// Extracts the resource description contained in one `<li>` element and
/// prints it to stdout.
///
/// The descendants matching `span,a,img,small` are visited in selection order:
/// * `<a>` with both `href` and `title` becomes a `ParseType::Link`; the text
///   node directly following it (if any) is passed to `parse_text`.
/// * `<img>` with `data-src`, `data-image-name`, `width` and `height` becomes
///   a `ParseType::Img`; unparsable dimensions fall back to `0`.
/// * `<small>` has its text passed to `parse_text`.
/// * Everything else (including `<span>` itself) contributes nothing directly.
///
/// Items without children are reported and skipped.
fn parse_li_to_resource(item: ElementRef<'_>) {
    if !item.has_children() {
        println!("Item has no children, skipping.");
        return;
    }

    let mut resource_items: Vec<ParseType> = Vec::new();
    let selector = scraper::Selector::parse("span,a,img,small").unwrap();

    for child in item.select(&selector) {
        let elem = child.value();
        // Single lookup per attribute, by local name, exactly as the attribute iterator reports it.
        let attr = |key: &str| {
            elem.attrs()
                .find(|(name, _)| *name == key)
                .map(|(_, value)| value)
        };

        match elem.name() {
            "a" => {
                if let (Some(url), Some(title)) = (attr("href"), attr("title")) {
                    resource_items.push(ParseType::Link {
                        url: url.to_string(),
                        title: title.to_string(),
                    });
                    let txt = get_text_next(&child);
                    if !txt.is_empty() {
                        parse_text(&txt, &mut resource_items);
                    }
                }
            }
            "img" => {
                if let (Some(url), Some(name), Some(width), Some(height)) = (
                    attr("data-src"),
                    attr("data-image-name"),
                    attr("width"),
                    attr("height"),
                ) {
                    resource_items.push(ParseType::Img(Icon {
                        name: name.to_string(),
                        url: url.to_string(),
                        width: width.parse().unwrap_or(0),
                        height: height.parse().unwrap_or(0),
                        content_type: "image/png".to_string(), // Assuming PNG, adjust as needed
                    }));
                }
            }
            "small" => {
                let txt = get_text(&child);
                parse_text(&txt, &mut resource_items);
            }
            _ => {}
        }
    }

    // Dump everything that was recognised for this list item.
    for entry in &resource_items {
        match entry {
            ParseType::Link { url, title } => {
                println!("Link: {} - {}", url, title);
            }
            ParseType::Img(icon) => {
                println!(
                    "Image: {} ({}x{}) - {}",
                    icon.name, icon.width, icon.height, icon.url
                );
            }
            ParseType::Count(count) => {
                println!("Count: {}", count);
            }
            ParseType::ResourceAdd => {
                println!("ResourceAdd");
            }
            ParseType::ResourceLast => {
                println!("ResourceLast");
            }
            ParseType::Duration { duration, unit } => {
                println!("Duration: {} {}", duration, unit);
            }
        }
    }
    println!("======================");
}
fn parse_text(text: &str, resource_items: &mut Vec<ParseType>) {
let reg1 = Regex::new(r"^\s*x(?<count>\d+)\s+(?<end>[→+])\s*$").unwrap();
let reg2 = Regex::new(r"^\s*\(.*(?<duration>\d+,\d+)\ssek\./(?<unit>\w+)\s*\)$").unwrap();
if let Some(res) = reg1.captures(text) {
let count = res.name("count").unwrap().as_str().parse().unwrap_or(0);
resource_items.push(ParseType::Count(count));
let end = res.name("end").unwrap().as_str().to_string();
if end == "+" {
resource_items.push(ParseType::ResourceAdd);
} else {
resource_items.push(ParseType::ResourceLast);
}
}
if let Some(res) = reg2.captures(text) {
let duration_str = res.name("duration").unwrap().as_str();
let duration: f64 = duration_str.replace(',', ".").parse().unwrap_or(0.0);
let unit = res.name("unit").unwrap().as_str().to_string();
let duration: u64 = (duration * 1000.0) as u64; // Convert to milliseconds
resource_items.push(ParseType::Duration { duration, unit });
}
}
/// Returns the trimmed text associated with `node`.
///
/// Resolution order:
/// 1. a text node yields its own trimmed content;
/// 2. an element yields the concatenation of `get_text` over its children,
///    provided that concatenation is non-empty;
/// 3. otherwise the trimmed content of the immediately following sibling is
///    returned when that sibling is a text node;
/// 4. failing all of the above, an empty string.
fn get_text(node: &NodeRef<'_, Node>) -> String {
    let value = node.value();

    if let Some(own) = value.as_text() {
        return own.text.trim().to_string();
    }

    if value.is_element() {
        let joined: String = node.children().map(|child| get_text(&child)).collect();
        if !joined.is_empty() {
            return joined;
        }
    }

    // Fall back to the text node that directly follows this node, if any.
    match node.next_sibling() {
        Some(sibling) => match sibling.value().as_text() {
            Some(following) => following.text.trim().to_string(),
            None => String::new(),
        },
        None => String::new(),
    }
}
/// Returns the trimmed content of `node` when it is a text node; otherwise the
/// trimmed content of its immediately following sibling when that sibling is a
/// text node. Yields an empty string in every other case.
fn get_text_next(node: &NodeRef<'_, Node>) -> String {
    if let Some(own) = node.value().as_text() {
        return own.text.trim().to_string();
    }

    match node.next_sibling() {
        Some(sibling) => match sibling.value().as_text() {
            Some(following) => following.text.trim().to_string(),
            None => String::new(),
        },
        None => String::new(),
    }
}
/// Collects the nodes of interest below `child` into `resource_items`.
///
/// * A text node is pushed as-is.
/// * An element is pushed itself when it is an `<a>`; afterwards every
///   descendant matching `a,img,small` is pushed in selection order.
/// * Any other node kind is ignored.
///
/// Returns `true` only when `child` is a `<span>` element.
fn add_all_children<'a>(
    child: NodeRef<'a, Node>,
    resource_items: &mut Vec<NodeRef<'a, Node>>,
) -> bool {
    if child.value().is_text() {
        resource_items.push(child);
        return false;
    }

    // `wrap` succeeds exactly for element nodes.
    let Some(element) = ElementRef::wrap(child) else {
        return false;
    };

    let tag = element.value().name();
    if tag == "a" {
        resource_items.push(child);
    }

    let wanted = scraper::Selector::parse("a,img,small").unwrap();
    resource_items.extend(element.select(&wanted).map(|found| *found));

    tag == "span"
}
/// Prints a trace of the collected `resource_items`.
///
/// Text nodes are printed with ASCII whitespace trimmed; every other node is
/// passed to `check_item`, after which the current `url` value is printed.
/// An empty input only produces a notice.
fn parse_resource(resource_items: Vec<NodeRef<'_, Node>>) {
    if resource_items.is_empty() {
        println!("No resource items to parse.");
        return;
    }

    let mut url: Option<&str> = None;
    let mut title: Option<&str> = None;
    let mut icon: Option<Icon> = None;

    println!("Parsing resource items...");
    for node in &resource_items {
        match node.value().as_text() {
            Some(text) => println!("Text: {}", text.text.trim_ascii()),
            None => {
                check_item(node, &mut url, &mut title, &mut icon);
                println!("Resource: {:?}", url);
            }
        }
    }
    println!("------------------");
}
/// Prints the tag name and attributes of `item` and sets `url` to the
/// placeholder value `"test"`.
///
/// `title` and `icon` are accepted but not touched.
///
/// # Panics
/// Panics when `item` is not an element node.
fn check_item(
    item: &NodeRef<'_, Node>,
    url: &mut Option<&str>,
    title: &mut Option<&str>,
    icon: &mut Option<Icon>,
) {
    let element = item.value().as_element().unwrap();
    println!("Checking item: {} {:?}", element.name(), element.attrs);
    *url = Some("test");
}