From c1c0a0601c5d7a3ee4b9f468bcd16d35e0fda1f2 Mon Sep 17 00:00:00 2001 From: Stefan Menner Date: Tue, 3 Jun 2025 17:38:11 +0200 Subject: [PATCH] Update parser --- src/main.rs | 2 +- src/parse/mod.rs | 673 +++++++++++++++------------------------------ src/types/types.rs | 9 +- 3 files changed, 238 insertions(+), 446 deletions(-) diff --git a/src/main.rs b/src/main.rs index 5c31315..eb7c0e4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,7 @@ use parse::parse; fn main() -> Result<(), Box> { env_logger::init(); - let html = read("test_vc.html")?; + let html = read("test_mordit.html")?; parse(&html); Ok(()) } diff --git a/src/parse/mod.rs b/src/parse/mod.rs index dd2412a..ffd4946 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, hash::Hash}; +use std::collections::HashMap; use regex::Regex; use select::{ @@ -7,7 +7,7 @@ use select::{ predicate::{Attr, Name, Or}, }; -use crate::types::{Icon, Ingredient, Recipe, RecipeType, Resource, ResourceState}; +use crate::types::{Duration, Icon, Ingredient, Recipe, RecipeType, Resource, ResourceState}; #[derive(Debug, PartialEq, Eq)] enum ParseType { @@ -17,7 +17,61 @@ enum ParseType { Resource(String), ResourceAdd, ResourceLast, - Duration { duration: u64, unit: String }, + DashDash, + Duration(Duration), +} + +type ResourceTmp = ( + Option, // name + Option, // title + Option, // icon + Option, // count + Option, // duration +); + +pub fn parse(html: &str) { + let document = Document::from(html); + + let mut map_resource: HashMap = HashMap::new(); + let mut recipes: Vec = Vec::new(); + + parse_source(&document, &mut map_resource, &mut recipes); + + parse_dst( + &document, + "Raffination", + RecipeType::Refining, + &mut map_resource, + &mut recipes, + ); + + parse_dst( + &document, + "Herstellung", + RecipeType::Production, + &mut map_resource, + &mut recipes, + ); + + parse_dst( + &document, + "Raffination_2", + RecipeType::Refining, + &mut map_resource, + &mut recipes, + ); + + parse_dst( + &document, + "Kochen", + RecipeType::Cooking, + &mut map_resource, + &mut recipes, + ); + + print_recipe(&recipes, RecipeType::Refining); + print_recipe(&recipes, RecipeType::Production); + print_recipe(&recipes, RecipeType::Cooking); } fn first(iter: &mut impl Iterator) -> Option { @@ -25,7 +79,7 @@ fn first(iter: &mut impl Iterator) -> Option { } fn first_child_element(node: Node<'_>) -> Option> { - if (node.children().next().is_none()) { + if node.children().next().is_none() { return None; } @@ -44,17 +98,126 @@ fn first_child_element(node: Node<'_>) -> Option> { None } -pub fn parse(html: &str) { - let document = Document::from(html); +fn parse_resource_items( + resource_items: Vec, + recipe_type: RecipeType, + map_resource: &mut HashMap, +) -> Option { + let mut tmp_resource: ResourceTmp = (None, None, None, None, None); - parse_source(&document); - parse_dst(&document, "Herstellung", RecipeType::Production); - parse_dst(&document, "Raffination", RecipeType::Refining); - parse_dst(&document, "Raffination_2", RecipeType::Refining); - parse_dst(&document, "Kochen", RecipeType::Cooking); + let mut ingredient_to_add: Vec = Vec::new(); + let mut not_add = false; + + for item in resource_items.iter() { + match item { + ParseType::Link { url, title } => { + if tmp_resource.0.is_none() { + tmp_resource.0 = Some(title.to_string()); + } + if tmp_resource.1.is_none() { + tmp_resource.1 = Some(url.to_string()); + } + + // println!("Link: {} - {}", url, title); + } + ParseType::Img(icon) => { + if tmp_resource.2.is_none() { + tmp_resource.2 = Some(icon.clone()); + } + } + ParseType::Count(count) => { + if tmp_resource.3.is_none() { + tmp_resource.3 = Some(*count); + } + } + ParseType::Resource(resource) => { + // println!("Resource: {}", resource); + if tmp_resource.0.is_none() { + tmp_resource.0 = Some(resource.to_string()); + } + } + ParseType::ResourceAdd => { + if !not_add { + add(&tmp_resource, map_resource, &mut ingredient_to_add); + } + + not_add = false; + + tmp_resource = (None, None, None, None, None); // Reset for next resource + // println!("ResourceAdd"); + } + ParseType::ResourceLast => { + if !not_add { + add(&tmp_resource, map_resource, &mut ingredient_to_add); + } + + not_add = false; + + tmp_resource = (None, None, None, None, None); // Reset for next resource + // println!("ResourceLast"); + } + ParseType::Duration(duration) => { + // println!(">>> Duration: {} {}", duration, unit); + + if tmp_resource.4.is_none() { + tmp_resource.4 = Some(duration.clone()); + } + } + ParseType::DashDash => { + not_add = false; + tmp_resource = (None, None, None, None, None); // Reset for next resource + } + } + } + + let (_, ingredient) = create_resource_and_ingredient(&tmp_resource, map_resource); + + if !ingredient_to_add.is_empty() { + let recipe = crate::types::Recipe { + recipe_type: recipe_type, + resource: ingredient, + duration: tmp_resource.4.unwrap_or(Duration { + millis: 0, + unit: "Stück".to_string(), + }), + ingredients: ingredient_to_add, + }; + + return Some(recipe); + } + + None } -fn parse_dst(document: &Document, id: &str, recipe_type: RecipeType) -> bool { +fn print_recipe(recipes: &Vec, recipe_type: RecipeType) { + for recipe in recipes + .iter() + .filter(|recipe| recipe.recipe_type == recipe_type) + { + println!("Recipe Type: {:?}", recipe.recipe_type); + println!( + "Resource: {} ({})", + recipe.resource.resource.name, recipe.resource.quantity + ); + println!("Duration: {} ms", recipe.duration.millis); + println!("Ingredients:"); + for ingredient in &recipe.ingredients { + println!( + "- {} ({} x {})", + ingredient.resource.name, ingredient.quantity, ingredient.resource.title + ); + } + println!(); + } +} + +fn parse_dst( + document: &Document, + id: &str, + recipe_type: RecipeType, + map_resource: &mut HashMap, + recipes: &mut Vec, +) -> bool { let mut dest = document.find(Attr("id", id)); let dest = first(&mut dest); @@ -100,13 +263,19 @@ fn parse_dst(document: &Document, id: &str, recipe_type: RecipeType) -> bool { let li = elt_ul.find(Name("li")); for item in li { - parse_source_li_to_resource(&item, recipe_type.clone()); + if let Some(recipe) = parse_li_to_resource(&item, recipe_type.clone(), map_resource) { + recipes.push(recipe); + } } return true; } -fn parse_source(document: &Document) -> bool { +fn parse_source( + document: &Document, + map_resource: &mut HashMap, + recipes: &mut Vec, +) -> bool { let mut source = document.find(Attr("id", "Quelle")); let source = first(&mut source); @@ -145,27 +314,20 @@ fn parse_source(document: &Document) -> bool { let elt_ul = elt_ul.unwrap(); let li = elt_ul.find(Name("li")); - let mut recipes: Vec = Vec::new(); - for item in li { - if let Some(recipe) = parse_source_li_to_resource(&item, RecipeType::Refining) { + if let Some(recipe) = parse_li_to_resource(&item, RecipeType::Refining, map_resource) { recipes.push(recipe); } } - for recipe in recipes { - let name = recipe.resource.resource.name.clone(); - let title = recipe.resource.resource.title.clone(); - let url = recipe.resource.resource.url.clone(); - let icon = recipe.resource.resource.icon.clone(); - - println!("Resource: {} - {} - {:?} - {:?}", name, title, url, icon); - } - return true; } -fn parse_source_li_to_resource(item: &Node<'_>, recipe_type: RecipeType) -> Option { +fn parse_li_to_resource( + item: &Node<'_>, + recipe_type: RecipeType, + map_resource: &mut HashMap, +) -> Option { if item.children().next().is_none() { return None; } @@ -173,24 +335,27 @@ fn parse_source_li_to_resource(item: &Node<'_>, recipe_type: RecipeType) -> Opti let mut resource_items: Vec = Vec::new(); let selector = item.find(Or( - Or(Name("span"), Name("a")), - Or(Name("img"), Name("small")), + Name("strong"), + Or(Or(Name("span"), Name("a")), Or(Name("img"), Name("small"))), )); for child in selector { let name = child.name().unwrap(); if name == "a" && child.attr("href").is_some() && child.attr("title").is_some() { + let txt = get_text_next(&child); + resource_items.push(ParseType::Link { url: child.attr("href").unwrap().to_string(), title: child.attr("title").unwrap().to_string(), }); - let txt = get_text_next(&child); - if !txt.is_empty() { parse_text(&txt, &mut resource_items); } + if txt == "--" { + resource_items.push(ParseType::DashDash); + } } else if name == "img" && child.attr("data-src").is_some() && child.attr("width").is_some() @@ -211,112 +376,30 @@ fn parse_source_li_to_resource(item: &Node<'_>, recipe_type: RecipeType) -> Opti })); } else if name == "span" && !child.text().is_empty() - && child.parent().unwrap().name().unwrap() == "strong" + && (child.parent().unwrap().name().unwrap() == "strong" + || child.parent().unwrap().name().unwrap() == "span") { let txt = child.text().trim().to_string(); resource_items.push(ParseType::Resource(txt)); let txt = get_text_next(&child.parent().unwrap()); parse_text(&txt, &mut resource_items); - } else if name == "strong" && !child.text().is_empty() { + } else if name == "strong" { + // let txt = get_text_next(&child); + // parse_text(&txt, &mut resource_items); } else if name == "small" { let txt = get_text(&child); parse_text(&txt, &mut resource_items); } } - let mut tmp_resource: ( - Option, // name - Option, // title - Option, - Option, // count - Option, // duration - ) = (None, None, None, None, None); - - let mut map_resource: HashMap = HashMap::new(); - let mut ingredient_to_add: Vec = Vec::new(); - - for item in resource_items.iter() { - match item { - ParseType::Link { url, title } => { - if tmp_resource.0.is_none() { - tmp_resource.0 = Some(title.to_string()); - } - if tmp_resource.1.is_none() { - tmp_resource.1 = Some(url.to_string()); - } - - // println!("Link: {} - {}", url, title); - } - ParseType::Img(icon) => { - // println!( - // "Image: {} ({}x{}) - {}", - // icon.name, icon.width, icon.height, icon.url - // ); - - if (tmp_resource.2.is_none()) { - tmp_resource.2 = Some(icon.clone()); - } - } - ParseType::Count(count) => { - // println!("Count: {}", count); - - if (tmp_resource.3.is_none()) { - tmp_resource.3 = Some(*count); - } - } - ParseType::Resource(resource) => { - // println!("Resource: {}", resource); - if (tmp_resource.0.is_none()) { - tmp_resource.0 = Some(resource.to_string()); - } - } - ParseType::ResourceAdd => { - add(&tmp_resource, &mut map_resource, &mut ingredient_to_add); - tmp_resource = (None, None, None, None, None); // Reset for next resource - // println!("ResourceAdd"); - } - ParseType::ResourceLast => { - add(&tmp_resource, &mut map_resource, &mut ingredient_to_add); - tmp_resource = (None, None, None, None, None); // Reset for next resource - // println!("ResourceLast"); - } - ParseType::Duration { duration, unit } => { - // println!(">>> Duration: {} {}", duration, unit); - - if tmp_resource.4.is_none() { - tmp_resource.4 = Some(*duration); - } - } - } - } - - let (_, ingredient) = create_resource_and_ingredient(&tmp_resource, &mut map_resource); - - if !ingredient_to_add.is_empty() { - let recipe = crate::types::Recipe { - recipe_type: recipe_type, - resource: ingredient, - duration: tmp_resource.4.unwrap_or(0), - ingredients: ingredient_to_add, - }; - - return Some(recipe); - } - - None + parse_resource_items(resource_items, recipe_type, map_resource) } fn to_name(name: &str) -> String { normalize_text(name).replace(" ", "_").to_lowercase() } fn create_resource_and_ingredient( - tmp_resource: &( - Option, - Option, - Option, - Option, - Option, - ), + tmp_resource: &ResourceTmp, map_resource: &mut HashMap, ) -> (Resource, Ingredient) { let title = tmp_resource.0.as_ref().unwrap().clone(); @@ -351,14 +434,9 @@ fn create_resource_and_ingredient( (resource, ingredient) } + fn add( - tmp_resource: &( - Option, - Option, - Option, - Option, - Option, - ), + tmp_resource: &ResourceTmp, map_resource: &mut HashMap, ingredient_to_add: &mut Vec, ) { @@ -380,6 +458,12 @@ fn normalize_text(text: &str) -> String { .replace('\r', " ") .replace('\t', " "); + for c in text.clone().chars() { + if c.is_control() || c.is_whitespace() { + text = text.replace(c, " "); + } + } + while text.contains(" ") { text = text.replace(" ", " "); } @@ -444,6 +528,8 @@ fn get_text_next(node: &Node<'_>) -> String { if node.as_text().is_some() { text.push_str(node.as_text().unwrap().trim()); + } else if node.name().is_some() && node.name().unwrap() == "i" { + text.push_str(node.text().trim()); } else { break; } @@ -456,10 +542,12 @@ fn get_text_next(node: &Node<'_>) -> String { } fn parse_text(text: &str, resource_items: &mut Vec) { - let reg1 = Regex::new(r"^\s*x(?\d+)\s+(?[→+])\s*$").unwrap(); - let reg2 = Regex::new(r"^\s*\(.*(?\d+,\d+)\ssek\./(?\w+)\s*\)$").unwrap(); + let reg_count_next = Regex::new(r"^\s*x(?\d+)\s+(?[→+])\s*$").unwrap(); + let reg_count = Regex::new(r"^\s*x(?\d+)\s*.*$").unwrap(); + let reg_duration = + Regex::new(r"^.*\(.*(?\d+(|,\d+))\ssek\./(?\w+)\s*\)$").unwrap(); - if let Some(res) = reg1.captures(text) { + if let Some(res) = reg_count_next.captures(text) { let count = res.name("count").unwrap().as_str().parse().unwrap_or(0); resource_items.push(ParseType::Count(count)); @@ -475,324 +563,21 @@ fn parse_text(text: &str, resource_items: &mut Vec) { return; } - if let Some(res) = reg2.captures(text) { + if let Some(res) = reg_count.captures(text) { + let count = res.name("count").unwrap().as_str().parse().unwrap_or(0); + + resource_items.push(ParseType::Count(count)); + } + + if let Some(res) = reg_duration.captures(text) { let duration_str = res.name("duration").unwrap().as_str(); let duration: f64 = duration_str.replace(',', ".").parse().unwrap_or(0.0); let unit = res.name("unit").unwrap().as_str().to_string(); let duration: u64 = (duration * 1000.0) as u64; // Convert to milliseconds - resource_items.push(ParseType::Duration { duration, unit }); + resource_items.push(ParseType::Duration(Duration { + millis: duration, + unit, + })); } } - -// pub fn parse1(html: &str) -> Result> { -// // Parse the HTML content -// let document = scraper::Html::parse_document(html); -// let selector_usage = scraper::Selector::parse("#Verwendung").unwrap(); -// let selector_li = scraper::Selector::parse("li").unwrap(); - -// parse_source(&document); - -// let elt_usage = document.select(&selector_usage).next(); - -// if elt_usage.is_none() { -// eprintln!("No element found with the selector '#Verwendung'"); -// return Ok(false); -// } - -// let elt_usage = elt_usage.unwrap(); -// let mut elt_usage = elt_usage.parent().unwrap(); - -// let mut elt_ul = None; - -// while elt_usage.next_sibling().is_some() { -// elt_usage = elt_usage.next_sibling().unwrap(); - -// if !elt_usage.value().is_element() { -// continue; -// } - -// let elt = elt_usage.value().as_element().unwrap(); -// if elt.name() == "ul" { -// elt_ul = Some(elt_usage); -// break; -// } - -// if elt.name() == "h2" { -// eprintln!("Found 'h2' element, stopping search for 'ul'"); -// break; -// } -// } - -// if elt_ul.is_none() { -// eprintln!("No second 'ul' element found after '#Verwendung'"); -// return Ok(false); -// } -// let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap(); -// let li = elt_ul.select(&selector_li); - -// for item in li { -// parse_source_li_to_resource(item); -// } -// Ok(true) -// } - -// fn parse_source(document: &Html) -> bool { -// let selector_source = scraper::Selector::parse("#Quelle").unwrap(); -// let selector_li = scraper::Selector::parse("li").unwrap(); -// let elt_source = document.select(&selector_source).next(); - -// if elt_source.is_none() { -// eprintln!("No element found with the selector '#Quelle'"); -// return false; -// } - -// let elt_source = elt_source.unwrap(); - -// let mut elt_source = elt_source.parent().unwrap(); - -// let mut c = 0; -// let mut elt_ul = None; - -// while elt_source.next_sibling().is_some() { -// elt_source = elt_source.next_sibling().unwrap(); - -// if !elt_source.value().is_element() { -// continue; -// } - -// let elt = elt_source.value().as_element().unwrap(); -// if elt.name() == "ul" { -// c += 1; - -// if c > 1 { -// elt_ul = Some(elt_source); -// break; -// } -// } - -// if elt.name() == "h2" { -// eprintln!("Found 'h2' element, stopping search for 'ul'"); -// break; -// } -// } - -// if elt_ul.is_none() { -// eprintln!("No second 'ul' element found after '#Quelle'"); -// return false; -// } -// let elt_ul = ElementRef::wrap(elt_ul.unwrap()).unwrap(); -// let li = elt_ul.select(&selector_li); - -// for item in li { -// parse_source_li_to_resource(item); -// } - -// return true; -// } - -// fn parse_source_li_to_resource(item: ElementRef<'_>) { -// if !item.has_children() { -// println!("Item has no children, skipping."); -// return; -// } - -// let mut resource_items: Vec = Vec::new(); - -// let selector = scraper::Selector::parse("span,a,img,small").unwrap(); -// let iter = item.select(&selector); - -// for child in iter { -// let elem = child.value(); -// let name = elem.name(); - -// if name == "a" -// && elem.attrs().find(|attr| attr.0 == "href").is_some() -// && elem.attrs().find(|attr| attr.0 == "title").is_some() -// { -// resource_items.push(ParseType::Link { -// url: elem -// .attrs() -// .find(|attr| attr.0 == "href") -// .unwrap() -// .1 -// .to_string(), -// title: elem -// .attrs() -// .find(|attr| attr.0 == "title") -// .unwrap() -// .1 -// .to_string(), -// }); - -// let txt = get_text_next(&child); - -// if !txt.is_empty() { -// parse_text(&txt, &mut resource_items); -// } -// } else if name == "img" -// && elem.attrs().find(|attr| attr.0 == "data-src").is_some() -// && elem.attrs().find(|attr| attr.0 == "width").is_some() -// && elem.attrs().find(|attr| attr.0 == "height").is_some() -// && elem -// .attrs() -// .find(|attr| attr.0 == "data-image-name") -// .is_some() -// { -// let url = elem.attrs().find(|attr| attr.0 == "data-src").unwrap().1; -// let name = elem -// .attrs() -// .find(|attr| attr.0 == "data-image-name") -// .unwrap() -// .1; -// let width = elem -// .attrs() -// .find(|attr| attr.0 == "width") -// .unwrap() -// .1 -// .parse() -// .unwrap_or(0); -// let height = elem -// .attrs() -// .find(|attr| attr.0 == "height") -// .unwrap() -// .1 -// .parse() -// .unwrap_or(0); - -// resource_items.push(ParseType::Img(Icon { -// name: name.to_string(), -// url: url.to_string(), -// width, -// height, -// content_type: "image/png".to_string(), // Assuming PNG, adjust as needed -// })); -// } else if name == "small" { -// let txt = get_text(&child); -// parse_text(&txt, &mut resource_items); -// } -// } -// for item in resource_items.iter() { -// match item { -// ParseType::Link { url, title } => { -// println!("Link: {} - {}", url, title); -// } -// ParseType::Img(icon) => { -// println!( -// "Image: {} ({}x{}) - {}", -// icon.name, icon.width, icon.height, icon.url -// ); -// } -// ParseType::Count(count) => { -// println!("Count: {}", count); -// } -// ParseType::ResourceAdd => { -// println!("ResourceAdd") -// } -// ParseType::ResourceLast => { -// println!("ResourceLast") -// } -// ParseType::Duration { duration, unit } => { -// println!(">>> Duration: {} {}", duration, unit) -// } -// } -// } -// println!("======================"); -// } - -// fn parse_text(text: &str, resource_items: &mut Vec) { -// let reg1 = Regex::new(r"^\s*x(?\d+)\s+(?[→+])\s*$").unwrap(); -// let reg2 = Regex::new(r"^\s*\(.*(?\d+,\d+)\ssek\./(?\w+)\s*\)$").unwrap(); - -// if let Some(res) = reg1.captures(text) { -// let count = res.name("count").unwrap().as_str().parse().unwrap_or(0); - -// resource_items.push(ParseType::Count(count)); - -// let end = res.name("end").unwrap().as_str().to_string(); - -// if end == "+" { -// resource_items.push(ParseType::ResourceAdd); -// } else { -// resource_items.push(ParseType::ResourceLast); -// } - -// return; -// } - -// if let Some(res) = reg2.captures(text) { -// let duration_str = res.name("duration").unwrap().as_str(); -// let duration: f64 = duration_str.replace(',', ".").parse().unwrap_or(0.0); -// let unit = res.name("unit").unwrap().as_str().to_string(); -// let duration: u64 = (duration * 1000.0) as u64; // Convert to milliseconds - -// resource_items.push(ParseType::Duration { duration, unit }); -// } -// } - -// fn get_text(node: &NodeRef<'_, Node>) -> String { -// let mut text = String::new(); - -// println!("{:?}", node.value()); - -// if node.value().is_text() { -// text.push_str(node.value().as_text().unwrap().to_string().as_str()); -// } - -// if node.has_children() { -// for child in node.descendants() { -// if (child.value().is_text()) { -// text.push_str(child.value().as_text().unwrap().to_string().as_str()); -// } -// } -// } - -// if node.has_siblings() { -// let mut next = node.next_sibling(); - -// while next.is_some() { -// let next_node = next.unwrap(); -// if next_node.value().is_text() { -// text.push_str(next_node.value().as_text().unwrap()); -// } else { -// break; -// } -// next = next_node.next_sibling(); -// } -// } - -// return text; -// } - -// fn get_text_next(node: &NodeRef<'_, Node>) -> String { -// if node.value().is_text() { -// return node.value().as_text().unwrap().text.trim().to_string(); -// } -// let next = node.next_sibling(); - -// if !next.is_some() { -// return String::new(); -// } - -// let next = next.unwrap(); - -// if next.value().is_text() { -// let mut text = next.value().as_text().unwrap().text.trim().to_string(); -// let mut next = next.next_sibling(); - -// while (next.is_some()) { -// let node = next.unwrap(); -// next = node.next_sibling(); - -// if node.value().is_text() { -// text.push_str(node.value().as_text().unwrap().text.trim()); -// } else { -// break; -// } -// } - -// return text; -// } - -// String::new() -// } diff --git a/src/types/types.rs b/src/types/types.rs index abfd90f..57a4c70 100644 --- a/src/types/types.rs +++ b/src/types/types.rs @@ -45,10 +45,17 @@ pub enum RecipeType { Refining, Cooking, } + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Duration { + pub millis: u64, + pub unit: String, +} + #[derive(Debug, PartialEq, Eq, Clone)] pub struct Recipe { pub recipe_type: RecipeType, pub resource: Ingredient, - pub duration: u64, + pub duration: Duration, pub ingredients: Vec, }