From a359c9da4ed354f0b0061be88a2376fb34d6348f Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 21 Jul 2023 22:17:50 +0200 Subject: filter dictionary entries to limit to those with correct reading --- src/main.rs | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/main.rs b/src/main.rs index e081454..8a74d82 100644 --- a/src/main.rs +++ b/src/main.rs @@ -356,19 +356,20 @@ fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: writeln!(f, r#"
Explanation"#)?; let mut expl_batch = Vec::new(); let mut expl_all = Vec::new(); - for w in ex.expl.split(|c| c == ' ' || c == '~') { - let w = expl_clean_word(w); - let wchars = Charset::new(w); + for word in ex.expl.split(|c| c == ' ' || c == '~') { + let (keb, reb) = expl_clean_word(word); + let wchars = Charset::new(keb); if !wchars.intersects(&ex.chars) { continue; } - if let Some(ents) = dict_idx.get(w) { + if let Some(ents) = dict_idx.get(keb) { for ent in ents.iter() { - let s = dict_str(w, ent); - if wchars.intersects(&batch.chars) { - expl_batch.push(s); - } else { - expl_all.push(s); + if let Some(s) = dict_str(keb, reb, ent) { + if wchars.intersects(&batch.chars) { + expl_batch.push(s); + } else { + expl_all.push(s); + } } } } @@ -392,21 +393,29 @@ fn format_batch_aux<'a>(dict_idx: &DictIndex<'a>, count: usize, i: usize, batch: Ok(()) } -fn expl_clean_word(w: &str) -> &str { +fn expl_clean_word(w: &str) -> (&str, Option<&str>) { let mut ret = w; for delim in ['(', '{', '['] { if let Some((s, _)) = ret.split_once(delim) { ret = s; } } - ret + let p = w.split_once('(') + .and_then(|(_, r)| r.split_once(')')) + .map(|(p, _)| p); + (ret, p) } -fn dict_str<'a>(w: &str, ent: &roxmltree::Node<'a, 'a>) -> String { +fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> { let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + let reb = reb.text().unwrap().trim(); - let mut ret = format!("{} [{}]", w, reb.text().unwrap().trim()); + if qreb.map(|x| x != reb).unwrap_or(false) { + return None; + } + + let mut ret = format!("{} [{}]", qkeb, reb); for sense in ent.children().filter(|x| x.has_tag_name("sense")) { if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { @@ -417,7 +426,7 @@ fn dict_str<'a>(w: &str, ent: &roxmltree::Node<'a, 'a>) -> String { if ret.chars().rev().next() == Some(';') { ret.pop(); } - ret + Some(ret) } fn
format_index(batches: &[Batch], kanji_levels: &[(String, String)]) -> Result<()> { -- cgit v1.2.3