|
|
|
@@ -0,0 +1,155 @@
|
|
|
|
|
use std::{fs::File, io::{Read}};
|
|
|
|
|
|
|
|
|
|
use scraper::{ElementRef, Node};
|
|
|
|
|
use ego_tree::NodeRef;
|
|
|
|
|
|
|
|
|
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
|
|
env_logger::init();
|
|
|
|
|
let html = read("test.html")?;
|
|
|
|
|
parse(&html)?;
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fn _download_file(url: &str, _path: &str) -> Result<String, Box<dyn std::error::Error>> {
|
|
|
|
|
// Some simple CLI args requirements...
|
|
|
|
|
|
|
|
|
|
eprintln!("Fetching {url:?}...");
|
|
|
|
|
|
|
|
|
|
// reqwest::blocking::get() is a convenience function.
|
|
|
|
|
//
|
|
|
|
|
// In most cases, you should create/build a reqwest::Client and reuse
|
|
|
|
|
// it for all requests.
|
|
|
|
|
let res = reqwest::blocking::get(url)?;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let body = res.text()?;
|
|
|
|
|
Ok(body)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Reads the whole file at `path` into a `String`.
///
/// # Errors
///
/// Returns the underlying `std::io::Error` if the file cannot be opened or
/// read, or if its contents are not valid UTF-8.
fn read(path: &str) -> Result<String, std::io::Error> {
    let mut buffer = String::new();
    File::open(path)?.read_to_string(&mut buffer)?;
    Ok(buffer)
}
|
|
|
|
|
|
|
|
|
|
fn parse(html: &str) -> Result<bool, Box<dyn std::error::Error>> {
|
|
|
|
|
// Parse the HTML content
|
|
|
|
|
let document = scraper::Html::parse_document(html);
|
|
|
|
|
let selector_quelle = scraper::Selector::parse("#Quelle").unwrap();
|
|
|
|
|
let selector_verwendung = scraper::Selector::parse("#Verwendung").unwrap();
|
|
|
|
|
let selector_li = scraper::Selector::parse("li").unwrap();
|
|
|
|
|
|
|
|
|
|
let elt_quelle= document.select(&selector_quelle).next();
|
|
|
|
|
|
|
|
|
|
if elt_quelle.is_none() {
|
|
|
|
|
eprintln!("No element found with the selector '#Quelle'");
|
|
|
|
|
return Ok(false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let elt_quelle = elt_quelle.unwrap();
|
|
|
|
|
|
|
|
|
|
let mut elt_quelle = elt_quelle.parent().unwrap();
|
|
|
|
|
|
|
|
|
|
let mut c = 0;
|
|
|
|
|
let mut elt_ul = None;
|
|
|
|
|
|
|
|
|
|
while elt_quelle. next_sibling().is_some() {
|
|
|
|
|
|
|
|
|
|
elt_quelle = elt_quelle.next_sibling().unwrap();
|
|
|
|
|
|
|
|
|
|
if !elt_quelle.value().is_element(){
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let elt = elt_quelle.value().as_element().unwrap();
|
|
|
|
|
if elt.name() == "ul" {
|
|
|
|
|
c += 1;
|
|
|
|
|
|
|
|
|
|
if c > 1 {
|
|
|
|
|
elt_ul = Some(elt_quelle);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if elt.name() == "h2" {
|
|
|
|
|
eprintln!("Found 'h2' element, stopping search for 'ul'");
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if elt_ul.is_none() {
|
|
|
|
|
eprintln!("No second 'ul' element found after '#Quelle'");
|
|
|
|
|
return Ok(false);
|
|
|
|
|
}
|
|
|
|
|
let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap();
|
|
|
|
|
let li = elt_ul.select(&selector_li);
|
|
|
|
|
|
|
|
|
|
for item in li
|
|
|
|
|
{
|
|
|
|
|
parse_li_to_resource(item);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let elt_verwendung= document.select(&selector_verwendung).next();
|
|
|
|
|
|
|
|
|
|
if elt_verwendung.is_none() {
|
|
|
|
|
eprintln!("No element found with the selector '#Verwendung'");
|
|
|
|
|
return Ok(false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let elt_verwendung = elt_verwendung.unwrap();
|
|
|
|
|
|
|
|
|
|
Ok(true)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn parse_li_to_resource(item: ElementRef<'_>) {
|
|
|
|
|
if !item.has_children() {
|
|
|
|
|
println!("Item has no children, skipping.");
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mut resource_items = Vec::new();
|
|
|
|
|
|
|
|
|
|
let first_child = item.first_child().unwrap();
|
|
|
|
|
|
|
|
|
|
resource_items.push(first_child);
|
|
|
|
|
|
|
|
|
|
let iter = first_child.next_siblings();
|
|
|
|
|
|
|
|
|
|
for next in iter {
|
|
|
|
|
if next.value().is_text() {
|
|
|
|
|
resource_items.push(next);
|
|
|
|
|
}
|
|
|
|
|
else if next.value().is_element() {
|
|
|
|
|
if next.value().as_element().unwrap().name() == "span" {
|
|
|
|
|
parse_resource (resource_items);
|
|
|
|
|
resource_items = Vec::new();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
resource_items.push(next);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
println!("======================");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Prints the nodes that make up one resource to stdout.
///
/// Text nodes are printed with ASCII whitespace trimmed from both ends; every
/// other node is printed using its `Debug` representation. An empty input
/// produces a notice and nothing else; otherwise the output ends with a
/// separator line.
fn parse_resource(resource_items: Vec<NodeRef<'_, Node>>) {
    if resource_items.is_empty() {
        println!("No resource items to parse.");
        return;
    }

    println!("Parsing resource items...");

    for node in &resource_items {
        let value = node.value();
        match value.as_text() {
            Some(text) => println!("Text: {}", text.text.trim_ascii()),
            None => println!("Resource: {:?}", value),
        }
    }

    println!("------------------");
}
|