Change metadata
This commit is contained in:
155
src/main.rs
Normal file
155
src/main.rs
Normal file
@@ -0,0 +1,155 @@
|
||||
use std::{fs::File, io::{Read}};
|
||||
|
||||
use scraper::{ElementRef, Node};
|
||||
use ego_tree::NodeRef;
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
env_logger::init();
|
||||
let html = read("test.html")?;
|
||||
parse(&html)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
fn _download_file(url: &str, _path: &str) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// Some simple CLI args requirements...
|
||||
|
||||
eprintln!("Fetching {url:?}...");
|
||||
|
||||
// reqwest::blocking::get() is a convenience function.
|
||||
//
|
||||
// In most cases, you should create/build a reqwest::Client and reuse
|
||||
// it for all requests.
|
||||
let res = reqwest::blocking::get(url)?;
|
||||
|
||||
|
||||
let body = res.text()?;
|
||||
Ok(body)
|
||||
}
|
||||
|
||||
/// Reads the whole file at `path` into a `String`.
///
/// # Errors
///
/// Returns the `std::io::Error` produced when the file cannot be opened or
/// when reading it as UTF-8 text fails.
fn read(path: &str) -> Result<String, std::io::Error> {
    let mut text = String::new();

    // Open and drain the file in one expression; the handle is a temporary
    // that is closed as soon as the statement ends.
    File::open(path)?.read_to_string(&mut text)?;

    Ok(text)
}
|
||||
|
||||
fn parse(html: &str) -> Result<bool, Box<dyn std::error::Error>> {
|
||||
// Parse the HTML content
|
||||
let document = scraper::Html::parse_document(html);
|
||||
let selector_quelle = scraper::Selector::parse("#Quelle").unwrap();
|
||||
let selector_verwendung = scraper::Selector::parse("#Verwendung").unwrap();
|
||||
let selector_li = scraper::Selector::parse("li").unwrap();
|
||||
|
||||
let elt_quelle= document.select(&selector_quelle).next();
|
||||
|
||||
if elt_quelle.is_none() {
|
||||
eprintln!("No element found with the selector '#Quelle'");
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let elt_quelle = elt_quelle.unwrap();
|
||||
|
||||
let mut elt_quelle = elt_quelle.parent().unwrap();
|
||||
|
||||
let mut c = 0;
|
||||
let mut elt_ul = None;
|
||||
|
||||
while elt_quelle. next_sibling().is_some() {
|
||||
|
||||
elt_quelle = elt_quelle.next_sibling().unwrap();
|
||||
|
||||
if !elt_quelle.value().is_element(){
|
||||
continue;
|
||||
}
|
||||
|
||||
let elt = elt_quelle.value().as_element().unwrap();
|
||||
if elt.name() == "ul" {
|
||||
c += 1;
|
||||
|
||||
if c > 1 {
|
||||
elt_ul = Some(elt_quelle);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if elt.name() == "h2" {
|
||||
eprintln!("Found 'h2' element, stopping search for 'ul'");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if elt_ul.is_none() {
|
||||
eprintln!("No second 'ul' element found after '#Quelle'");
|
||||
return Ok(false);
|
||||
}
|
||||
let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap();
|
||||
let li = elt_ul.select(&selector_li);
|
||||
|
||||
for item in li
|
||||
{
|
||||
parse_li_to_resource(item);
|
||||
}
|
||||
|
||||
let elt_verwendung= document.select(&selector_verwendung).next();
|
||||
|
||||
if elt_verwendung.is_none() {
|
||||
eprintln!("No element found with the selector '#Verwendung'");
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let elt_verwendung = elt_verwendung.unwrap();
|
||||
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn parse_li_to_resource(item: ElementRef<'_>) {
|
||||
if !item.has_children() {
|
||||
println!("Item has no children, skipping.");
|
||||
return;
|
||||
}
|
||||
|
||||
let mut resource_items = Vec::new();
|
||||
|
||||
let first_child = item.first_child().unwrap();
|
||||
|
||||
resource_items.push(first_child);
|
||||
|
||||
let iter = first_child.next_siblings();
|
||||
|
||||
for next in iter {
|
||||
if next.value().is_text() {
|
||||
resource_items.push(next);
|
||||
}
|
||||
else if next.value().is_element() {
|
||||
if next.value().as_element().unwrap().name() == "span" {
|
||||
parse_resource (resource_items);
|
||||
resource_items = Vec::new();
|
||||
}
|
||||
|
||||
resource_items.push(next);
|
||||
}
|
||||
}
|
||||
|
||||
println!("======================");
|
||||
}
|
||||
|
||||
fn parse_resource(resource_items: Vec<NodeRef<'_, Node>>) {
|
||||
if resource_items.is_empty() {
|
||||
println!("No resource items to parse.");
|
||||
return;
|
||||
}
|
||||
|
||||
println!("Parsing resource items...");
|
||||
|
||||
for item in resource_items {
|
||||
if item.value().is_text() {
|
||||
println!("Text: {}", item.value().as_text().unwrap().text.trim_ascii());
|
||||
continue;
|
||||
}
|
||||
println!("Resource: {:?}", item.value());
|
||||
}
|
||||
|
||||
println!("------------------");
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user