Change metadata

This commit is contained in:
2025-05-27 15:43:44 +02:00
parent d2a8ddad8a
commit ce3cf003ff
10 changed files with 2519 additions and 0 deletions

155
src/main.rs Normal file
View File

@@ -0,0 +1,155 @@
use std::{fs::File, io::{Read}};
use scraper::{ElementRef, Node};
use ego_tree::NodeRef;
/// Program entry point.
///
/// Initialises `env_logger`, loads the local file `test.html` and runs
/// [`parse`] over its contents. Any I/O or parse error is propagated to the
/// runtime; the boolean outcome reported by `parse` is discarded.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    env_logger::init();

    let page = read("test.html")?;

    // Only success/failure matters here, not whether the page had the expected layout.
    parse(&page).map(|_found| ())
}
/// Fetches `url` with a blocking HTTP GET and returns the response body as text.
///
/// `_path` is accepted but not used by this function. The URL being fetched is
/// announced on stderr before the request is made.
///
/// # Errors
/// Returns the `reqwest` error if the request fails or the body cannot be
/// decoded as text.
fn _download_file(url: &str, _path: &str) -> Result<String, Box<dyn std::error::Error>> {
    eprintln!("Fetching {url:?}...");

    // `reqwest::blocking::get` is a one-shot convenience call; code issuing many
    // requests should build a single `reqwest::Client` and reuse it instead.
    let body = reqwest::blocking::get(url)?.text()?;
    Ok(body)
}
/// Reads the entire file at `path` into a `String`.
///
/// # Errors
/// Returns the underlying [`std::io::Error`] if the file cannot be opened or
/// read, or if its contents are not valid UTF-8.
fn read(path: &str) -> Result<String, std::io::Error> {
    // The standard-library helper opens the file, sizes the buffer from the
    // file's metadata and reads it in one go.
    std::fs::read_to_string(path)
}
/// Walks a parsed HTML page and hands every `<li>` of the resource list to
/// [`parse_li_to_resource`].
///
/// The list is located relative to the element with id `Quelle`: starting at
/// that element's parent, the following sibling nodes are scanned and the
/// *second* `<ul>` element encountered is used. The scan stops at the first
/// `<h2>` element. After the list has been processed, the page is also
/// checked for an element with id `Verwendung`.
///
/// # Returns
/// `Ok(true)` when both anchors and the list were found. `Ok(false)` — after a
/// diagnostic on stderr — when any of them is missing. No code path in this
/// function currently produces `Err`.
///
/// # Panics
/// Only if one of the hard-coded CSS selectors fails to parse, which would be
/// a programming error.
fn parse(html: &str) -> Result<bool, Box<dyn std::error::Error>> {
    let document = scraper::Html::parse_document(html);

    let selector_quelle =
        scraper::Selector::parse("#Quelle").expect("'#Quelle' is a valid CSS selector");
    let selector_verwendung =
        scraper::Selector::parse("#Verwendung").expect("'#Verwendung' is a valid CSS selector");
    let selector_li = scraper::Selector::parse("li").expect("'li' is a valid CSS selector");

    let Some(elt_quelle) = document.select(&selector_quelle).next() else {
        eprintln!("No element found with the selector '#Quelle'");
        return Ok(false);
    };

    // The search runs over the siblings of the node that *contains* '#Quelle'.
    let Some(anchor) = elt_quelle.parent() else {
        eprintln!("Element '#Quelle' has no parent node");
        return Ok(false);
    };

    // Scan element siblings only (text, comments, ... are skipped) for the second <ul>.
    let mut ul_seen = 0;
    let mut second_ul = None;
    for sibling in anchor.next_siblings().filter_map(ElementRef::wrap) {
        match sibling.value().name() {
            "ul" => {
                ul_seen += 1;
                if ul_seen > 1 {
                    second_ul = Some(sibling);
                    break;
                }
            }
            "h2" => {
                eprintln!("Found 'h2' element, stopping search for 'ul'");
                break;
            }
            _ => {}
        }
    }

    let Some(elt_ul) = second_ul else {
        eprintln!("No second 'ul' element found after '#Quelle'");
        return Ok(false);
    };

    for item in elt_ul.select(&selector_li) {
        parse_li_to_resource(item);
    }

    // Only the presence of '#Verwendung' is checked; the element itself is not used.
    if document.select(&selector_verwendung).next().is_none() {
        eprintln!("No element found with the selector '#Verwendung'");
        return Ok(false);
    }

    Ok(true)
}
/// Splits the direct children of one `<li>` into groups and passes each group
/// to [`parse_resource`].
///
/// The first child always opens the first group. After that, text nodes and
/// element nodes are appended to the current group, and every `<span>` element
/// closes the current group and opens a new one (the `<span>` is the first
/// entry of the new group). Other node kinds after the first child are
/// ignored. A separator line is printed to stdout once the item is done.
///
/// An item without children is reported on stdout and skipped.
fn parse_li_to_resource(item: ElementRef<'_>) {
    let Some(first_child) = item.first_child() else {
        println!("Item has no children, skipping.");
        return;
    };

    let mut resource_items = vec![first_child];
    for next in first_child.next_siblings() {
        let node = next.value();
        if node.is_text() {
            resource_items.push(next);
        } else if let Some(element) = node.as_element() {
            if element.name() == "span" {
                // A <span> starts a new resource: emit what was collected so far
                // and continue with an empty group.
                parse_resource(std::mem::take(&mut resource_items));
            }
            resource_items.push(next);
        }
    }

    // Emit the final group as well; otherwise everything after the last <span>
    // (or the whole item when it contains no <span>) would never be reported.
    parse_resource(resource_items);

    println!("======================");
}
fn parse_resource(resource_items: Vec<NodeRef<'_, Node>>) {
if resource_items.is_empty() {
println!("No resource items to parse.");
return;
}
println!("Parsing resource items...");
for item in resource_items {
if item.value().is_text() {
println!("Text: {}", item.value().as_text().unwrap().text.trim_ascii());
continue;
}
println!("Resource: {:?}", item.value());
}
println!("------------------");
}