Parser update
This commit is contained in:
367
src/main.rs
367
src/main.rs
@@ -1,16 +1,14 @@
|
||||
mod parse;
|
||||
mod types;
|
||||
|
||||
use std::{fs::File, io::Read, net::Incoming, ops::Deref, time::Duration};
|
||||
use std::{fs::File, io::Read};
|
||||
|
||||
use ego_tree::NodeRef;
|
||||
use regex::Regex;
|
||||
use scraper::{Element, ElementRef, Node};
|
||||
use types::{Icon, ParseType};
|
||||
use parse::parse;
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
env_logger::init();
|
||||
let html = read("test.html")?;
|
||||
parse(&html)?;
|
||||
let html = read("test_vc.html")?;
|
||||
parse(&html);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -35,358 +33,3 @@ fn read(path: &str) -> Result<String, std::io::Error> {
|
||||
file.read_to_string(&mut contents)?;
|
||||
Ok(contents)
|
||||
}
|
||||
|
||||
fn parse(html: &str) -> Result<bool, Box<dyn std::error::Error>> {
|
||||
// Parse the HTML content
|
||||
let document = scraper::Html::parse_document(html);
|
||||
let selector_quelle = scraper::Selector::parse("#Quelle").unwrap();
|
||||
let selector_verwendung = scraper::Selector::parse("#Verwendung").unwrap();
|
||||
let selector_li = scraper::Selector::parse("li").unwrap();
|
||||
|
||||
let elt_quelle = document.select(&selector_quelle).next();
|
||||
|
||||
if elt_quelle.is_none() {
|
||||
eprintln!("No element found with the selector '#Quelle'");
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let elt_quelle = elt_quelle.unwrap();
|
||||
|
||||
let mut elt_quelle = elt_quelle.parent().unwrap();
|
||||
|
||||
let mut c = 0;
|
||||
let mut elt_ul = None;
|
||||
|
||||
while elt_quelle.next_sibling().is_some() {
|
||||
elt_quelle = elt_quelle.next_sibling().unwrap();
|
||||
|
||||
if !elt_quelle.value().is_element() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let elt = elt_quelle.value().as_element().unwrap();
|
||||
if elt.name() == "ul" {
|
||||
c += 1;
|
||||
|
||||
if c > 1 {
|
||||
elt_ul = Some(elt_quelle);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if elt.name() == "h2" {
|
||||
eprintln!("Found 'h2' element, stopping search for 'ul'");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if elt_ul.is_none() {
|
||||
eprintln!("No second 'ul' element found after '#Quelle'");
|
||||
return Ok(false);
|
||||
}
|
||||
let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap();
|
||||
let li = elt_ul.select(&selector_li);
|
||||
|
||||
for item in li {
|
||||
parse_li_to_resource(item);
|
||||
}
|
||||
|
||||
let elt_verwendung = document.select(&selector_verwendung).next();
|
||||
|
||||
if elt_verwendung.is_none() {
|
||||
eprintln!("No element found with the selector '#Verwendung'");
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let elt_verwendung = elt_verwendung.unwrap();
|
||||
let mut elt_verwendung = elt_verwendung.parent().unwrap();
|
||||
|
||||
let mut elt_ul = None;
|
||||
|
||||
while elt_verwendung.next_sibling().is_some() {
|
||||
elt_verwendung = elt_verwendung.next_sibling().unwrap();
|
||||
|
||||
if !elt_verwendung.value().is_element() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let elt = elt_verwendung.value().as_element().unwrap();
|
||||
if elt.name() == "ul" {
|
||||
elt_ul = Some(elt_verwendung);
|
||||
break;
|
||||
}
|
||||
|
||||
if elt.name() == "h2" {
|
||||
eprintln!("Found 'h2' element, stopping search for 'ul'");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if elt_ul.is_none() {
|
||||
eprintln!("No second 'ul' element found after '#Verwendung'");
|
||||
return Ok(false);
|
||||
}
|
||||
let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap();
|
||||
let li = elt_ul.select(&selector_li);
|
||||
|
||||
for item in li {
|
||||
parse_li_to_resource(item);
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn parse_li_to_resource(item: ElementRef<'_>) {
|
||||
if !item.has_children() {
|
||||
println!("Item has no children, skipping.");
|
||||
return;
|
||||
}
|
||||
|
||||
let mut resource_items: Vec<ParseType> = Vec::new();
|
||||
|
||||
let selector = scraper::Selector::parse("span,a,img,small").unwrap();
|
||||
let iter = item.select(&selector);
|
||||
|
||||
for child in iter {
|
||||
let elem = child.value();
|
||||
let name = elem.name();
|
||||
|
||||
if name == "a"
|
||||
&& elem.attrs().find(|attr| attr.0 == "href").is_some()
|
||||
&& elem.attrs().find(|attr| attr.0 == "title").is_some()
|
||||
{
|
||||
resource_items.push(ParseType::Link {
|
||||
url: elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "href")
|
||||
.unwrap()
|
||||
.1
|
||||
.to_string(),
|
||||
title: elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "title")
|
||||
.unwrap()
|
||||
.1
|
||||
.to_string(),
|
||||
});
|
||||
|
||||
let txt = get_text_next(&child);
|
||||
|
||||
if !txt.is_empty() {
|
||||
parse_text(&txt, &mut resource_items);
|
||||
}
|
||||
} else if name == "img"
|
||||
&& elem.attrs().find(|attr| attr.0 == "data-src").is_some()
|
||||
&& elem.attrs().find(|attr| attr.0 == "width").is_some()
|
||||
&& elem.attrs().find(|attr| attr.0 == "height").is_some()
|
||||
&& elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "data-image-name")
|
||||
.is_some()
|
||||
{
|
||||
let url = elem.attrs().find(|attr| attr.0 == "data-src").unwrap().1;
|
||||
let name = elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "data-image-name")
|
||||
.unwrap()
|
||||
.1;
|
||||
let width = elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "width")
|
||||
.unwrap()
|
||||
.1
|
||||
.parse()
|
||||
.unwrap_or(0);
|
||||
let height = elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "height")
|
||||
.unwrap()
|
||||
.1
|
||||
.parse()
|
||||
.unwrap_or(0);
|
||||
|
||||
resource_items.push(ParseType::Img(Icon {
|
||||
name: name.to_string(),
|
||||
url: url.to_string(),
|
||||
width,
|
||||
height,
|
||||
content_type: "image/png".to_string(), // Assuming PNG, adjust as needed
|
||||
}));
|
||||
} else if name == "small" {
|
||||
let txt = get_text(&child);
|
||||
parse_text(&txt, &mut resource_items);
|
||||
}
|
||||
}
|
||||
for item in resource_items.iter() {
|
||||
match item {
|
||||
ParseType::Link { url, title } => {
|
||||
println!("Link: {} - {}", url, title);
|
||||
}
|
||||
ParseType::Img(icon) => {
|
||||
println!(
|
||||
"Image: {} ({}x{}) - {}",
|
||||
icon.name, icon.width, icon.height, icon.url
|
||||
);
|
||||
}
|
||||
ParseType::Count(count) => {
|
||||
println!("Count: {}", count);
|
||||
}
|
||||
ParseType::ResourceAdd => {
|
||||
println!("ResourceAdd");
|
||||
}
|
||||
ParseType::ResourceLast => {
|
||||
println!("ResourceLast");
|
||||
}
|
||||
ParseType::Duration { duration, unit } => {
|
||||
println!("Duration: {} {}", duration, unit);
|
||||
}
|
||||
}
|
||||
}
|
||||
println!("======================");
|
||||
}
|
||||
|
||||
fn parse_text(text: &str, resource_items: &mut Vec<ParseType>) {
|
||||
let reg1 = Regex::new(r"^\s*x(?<count>\d+)\s+(?<end>[→+])\s*$").unwrap();
|
||||
let reg2 = Regex::new(r"^\s*\(.*(?<duration>\d+,\d+)\ssek\./(?<unit>\w+)\s*\)$").unwrap();
|
||||
|
||||
if let Some(res) = reg1.captures(text) {
|
||||
let count = res.name("count").unwrap().as_str().parse().unwrap_or(0);
|
||||
|
||||
resource_items.push(ParseType::Count(count));
|
||||
|
||||
let end = res.name("end").unwrap().as_str().to_string();
|
||||
|
||||
if end == "+" {
|
||||
resource_items.push(ParseType::ResourceAdd);
|
||||
} else {
|
||||
resource_items.push(ParseType::ResourceLast);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(res) = reg2.captures(text) {
|
||||
let duration_str = res.name("duration").unwrap().as_str();
|
||||
let duration: f64 = duration_str.replace(',', ".").parse().unwrap_or(0.0);
|
||||
let unit = res.name("unit").unwrap().as_str().to_string();
|
||||
let duration: u64 = (duration * 1000.0) as u64; // Convert to milliseconds
|
||||
|
||||
resource_items.push(ParseType::Duration { duration, unit });
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the trimmed text associated with `node`.
///
/// * A text node yields its own trimmed content.
/// * An element yields the concatenation of `get_text` over its direct
///   children, provided that concatenation is non-empty.
/// * Otherwise the trimmed content of the immediately following sibling is
///   returned if that sibling is a text node, and an empty string if not.
fn get_text(node: &NodeRef<'_, Node>) -> String {
    if let Some(own) = node.value().as_text() {
        return own.text.trim().to_string();
    }

    if node.value().is_element() {
        let joined: String = node.children().map(|child| get_text(&child)).collect();
        if !joined.is_empty() {
            return joined;
        }
    }

    // Fallback: look at the sibling directly after this node.
    node.next_sibling()
        .and_then(|sibling| {
            sibling
                .value()
                .as_text()
                .map(|following| following.text.trim().to_string())
        })
        .unwrap_or_default()
}
|
||||
|
||||
/// Returns the trimmed content of `node` when it is a text node; otherwise
/// the trimmed content of the sibling directly after it when that sibling is
/// a text node. In every other case the result is an empty string.
fn get_text_next(node: &NodeRef<'_, Node>) -> String {
    if let Some(own) = node.value().as_text() {
        return own.text.trim().to_string();
    }

    match node.next_sibling() {
        Some(sibling) => match sibling.value().as_text() {
            Some(following) => following.text.trim().to_string(),
            None => String::new(),
        },
        None => String::new(),
    }
}
|
||||
|
||||
fn add_all_children<'a>(
|
||||
child: NodeRef<'a, Node>,
|
||||
resource_items: &mut Vec<NodeRef<'a, Node>>,
|
||||
) -> bool {
|
||||
let mut result = false;
|
||||
|
||||
if child.value().is_text() {
|
||||
resource_items.push(child);
|
||||
} else if child.value().is_element() {
|
||||
let selector = scraper::Selector::parse("a,img,small").unwrap();
|
||||
if child.value().as_element().unwrap().name() == "span" {
|
||||
result = true;
|
||||
}
|
||||
|
||||
if child.value().as_element().unwrap().name() == "a" {
|
||||
resource_items.push(child);
|
||||
}
|
||||
|
||||
let items = ElementRef::wrap(child).unwrap().select(&selector);
|
||||
|
||||
for item in items {
|
||||
let x = item.deref();
|
||||
resource_items.push(*x);
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn parse_resource(resource_items: Vec<NodeRef<'_, Node>>) {
|
||||
if resource_items.is_empty() {
|
||||
println!("No resource items to parse.");
|
||||
return;
|
||||
}
|
||||
|
||||
let mut url: Option<&str> = Option::None;
|
||||
let mut title: Option<&str> = Option::None;
|
||||
let mut icon: Option<Icon> = Option::None;
|
||||
|
||||
println!("Parsing resource items...");
|
||||
|
||||
for item in resource_items {
|
||||
if item.value().is_text() {
|
||||
println!(
|
||||
"Text: {}",
|
||||
item.value().as_text().unwrap().text.trim_ascii()
|
||||
);
|
||||
continue;
|
||||
}
|
||||
check_item(&item, &mut url, &mut title, &mut icon);
|
||||
println!("Resource: {:?}", url);
|
||||
}
|
||||
|
||||
println!("------------------");
|
||||
}
|
||||
|
||||
fn check_item(
|
||||
item: &NodeRef<'_, Node>,
|
||||
url: &mut Option<&str>,
|
||||
title: &mut Option<&str>,
|
||||
icon: &mut Option<Icon>,
|
||||
) {
|
||||
println!(
|
||||
"Checking item: {} {:?}",
|
||||
item.value().as_element().unwrap().name(),
|
||||
item.value().as_element().unwrap().attrs
|
||||
);
|
||||
*url = Some("test");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user