update
This commit is contained in:
@@ -12,5 +12,6 @@ ego-tree = "0.10.0"
|
||||
env_logger = "0.11.8"
|
||||
log = "0.4.27"
|
||||
merlin_env_helper = { version = "0.2.0", registry = "merlin" }
|
||||
regex = "1.11.1"
|
||||
reqwest = {version="0.12.15", features=["blocking"]}
|
||||
scraper = "0.23.1"
|
||||
|
||||
169
src/main.rs
169
src/main.rs
@@ -1,10 +1,11 @@
|
||||
mod types;
|
||||
|
||||
use std::{fs::File, io::Read, net::Incoming, ops::Deref};
|
||||
use std::{fs::File, io::Read, net::Incoming, ops::Deref, time::Duration};
|
||||
|
||||
use ego_tree::NodeRef;
|
||||
use regex::Regex;
|
||||
use scraper::{Element, ElementRef, Node};
|
||||
use types::Icon;
|
||||
use types::{Icon, ParseType};
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
env_logger::init();
|
||||
@@ -98,7 +99,39 @@ fn parse(html: &str) -> Result<bool, Box<dyn std::error::Error>> {
|
||||
}
|
||||
|
||||
let elt_verwendung = elt_verwendung.unwrap();
|
||||
let mut elt_verwendung = elt_verwendung.parent().unwrap();
|
||||
|
||||
let mut elt_ul = None;
|
||||
|
||||
while elt_verwendung.next_sibling().is_some() {
|
||||
elt_verwendung = elt_verwendung.next_sibling().unwrap();
|
||||
|
||||
if !elt_verwendung.value().is_element() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let elt = elt_verwendung.value().as_element().unwrap();
|
||||
if elt.name() == "ul" {
|
||||
elt_ul = Some(elt_verwendung);
|
||||
break;
|
||||
}
|
||||
|
||||
if elt.name() == "h2" {
|
||||
eprintln!("Found 'h2' element, stopping search for 'ul'");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if elt_ul.is_none() {
|
||||
eprintln!("No second 'ul' element found after '#Verwendung'");
|
||||
return Ok(false);
|
||||
}
|
||||
let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap();
|
||||
let li = elt_ul.select(&selector_li);
|
||||
|
||||
for item in li {
|
||||
parse_li_to_resource(item);
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
@@ -108,21 +141,137 @@ fn parse_li_to_resource(item: ElementRef<'_>) {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut resource_items: Vec<NodeRef<'_, Node>> = Vec::new();
|
||||
let mut resource_items: Vec<ParseType> = Vec::new();
|
||||
|
||||
let selector = scraper::Selector::parse("span,a,img,small").unwrap();
|
||||
let iter = item.select(&selector);
|
||||
|
||||
for child in iter {
|
||||
println!(
|
||||
"name: {}, text: {}, text1: {}",
|
||||
child.value().name(),
|
||||
get_text(child.deref()),
|
||||
get_text1(child.deref())
|
||||
);
|
||||
let elem = child.value();
|
||||
let name = elem.name();
|
||||
|
||||
if name == "a"
|
||||
&& elem.attrs().find(|attr| attr.0 == "href").is_some()
|
||||
&& elem.attrs().find(|attr| attr.0 == "title").is_some()
|
||||
{
|
||||
resource_items.push(ParseType::Link {
|
||||
url: elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "href")
|
||||
.unwrap()
|
||||
.1
|
||||
.to_string(),
|
||||
title: elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "title")
|
||||
.unwrap()
|
||||
.1
|
||||
.to_string(),
|
||||
});
|
||||
|
||||
let txt = get_text_next(&child);
|
||||
|
||||
if !txt.is_empty() {
|
||||
parse_text(&txt, &mut resource_items);
|
||||
}
|
||||
} else if name == "img"
|
||||
&& elem.attrs().find(|attr| attr.0 == "data-src").is_some()
|
||||
&& elem.attrs().find(|attr| attr.0 == "width").is_some()
|
||||
&& elem.attrs().find(|attr| attr.0 == "height").is_some()
|
||||
&& elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "data-image-name")
|
||||
.is_some()
|
||||
{
|
||||
let url = elem.attrs().find(|attr| attr.0 == "data-src").unwrap().1;
|
||||
let name = elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "data-image-name")
|
||||
.unwrap()
|
||||
.1;
|
||||
let width = elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "width")
|
||||
.unwrap()
|
||||
.1
|
||||
.parse()
|
||||
.unwrap_or(0);
|
||||
let height = elem
|
||||
.attrs()
|
||||
.find(|attr| attr.0 == "height")
|
||||
.unwrap()
|
||||
.1
|
||||
.parse()
|
||||
.unwrap_or(0);
|
||||
|
||||
resource_items.push(ParseType::Img(Icon {
|
||||
name: name.to_string(),
|
||||
url: url.to_string(),
|
||||
width,
|
||||
height,
|
||||
content_type: "image/png".to_string(), // Assuming PNG, adjust as needed
|
||||
}));
|
||||
} else if name == "small" {
|
||||
let txt = get_text(&child);
|
||||
parse_text(&txt, &mut resource_items);
|
||||
}
|
||||
}
|
||||
for item in resource_items.iter() {
|
||||
match item {
|
||||
ParseType::Link { url, title } => {
|
||||
println!("Link: {} - {}", url, title);
|
||||
}
|
||||
ParseType::Img(icon) => {
|
||||
println!(
|
||||
"Image: {} ({}x{}) - {}",
|
||||
icon.name, icon.width, icon.height, icon.url
|
||||
);
|
||||
}
|
||||
ParseType::Count(count) => {
|
||||
println!("Count: {}", count);
|
||||
}
|
||||
ParseType::ResourceAdd => {
|
||||
println!("ResourceAdd");
|
||||
}
|
||||
ParseType::ResourceLast => {
|
||||
println!("ResourceLast");
|
||||
}
|
||||
ParseType::Duration { duration, unit } => {
|
||||
println!("Duration: {} {}", duration, unit);
|
||||
}
|
||||
}
|
||||
}
|
||||
println!("======================");
|
||||
}
|
||||
|
||||
fn parse_text(text: &str, resource_items: &mut Vec<ParseType>) {
|
||||
let reg1 = Regex::new(r"^\s*x(?<count>\d+)\s+(?<end>[→+])\s*$").unwrap();
|
||||
let reg2 = Regex::new(r"^\s*\(.*(?<duration>\d+,\d+)\ssek\./(?<unit>\w+)\s*\)$").unwrap();
|
||||
|
||||
if let Some(res) = reg1.captures(text) {
|
||||
let count = res.name("count").unwrap().as_str().parse().unwrap_or(0);
|
||||
|
||||
resource_items.push(ParseType::Count(count));
|
||||
|
||||
let end = res.name("end").unwrap().as_str().to_string();
|
||||
|
||||
if end == "+" {
|
||||
resource_items.push(ParseType::ResourceAdd);
|
||||
} else {
|
||||
resource_items.push(ParseType::ResourceLast);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(res) = reg2.captures(text) {
|
||||
let duration_str = res.name("duration").unwrap().as_str();
|
||||
let duration: f64 = duration_str.replace(',', ".").parse().unwrap_or(0.0);
|
||||
let unit = res.name("unit").unwrap().as_str().to_string();
|
||||
let duration: u64 = (duration * 1000.0) as u64; // Convert to milliseconds
|
||||
|
||||
resource_items.push(ParseType::Duration { duration, unit });
|
||||
}
|
||||
}
|
||||
|
||||
fn get_text(node: &NodeRef<'_, Node>) -> String {
|
||||
if node.value().is_text() {
|
||||
return node.value().as_text().unwrap().text.trim().to_string();
|
||||
@@ -154,7 +303,7 @@ fn get_text(node: &NodeRef<'_, Node>) -> String {
|
||||
String::new()
|
||||
}
|
||||
|
||||
fn get_text1(node: &NodeRef<'_, Node>) -> String {
|
||||
fn get_text_next(node: &NodeRef<'_, Node>) -> String {
|
||||
if node.value().is_text() {
|
||||
return node.value().as_text().unwrap().text.trim().to_string();
|
||||
}
|
||||
|
||||
@@ -17,4 +17,13 @@ impl Clone for Icon {
|
||||
content_type: self.content_type.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum ParseType {
|
||||
Link { url: String, title: String },
|
||||
Img(Icon),
|
||||
Count(u32),
|
||||
ResourceAdd,
|
||||
ResourceLast,
|
||||
Duration { duration: u64, unit: String },
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user