From 64568528b13d08ceaa0c36c20b3aa20d966cfdcb Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 27 Nov 2023 20:54:17 +0100 Subject: furigana: handle reb as #number_in_jmdict, and check that result at least has correct text under furigana --- data/batches.json | 2 +- src/format.rs | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/data/batches.json b/data/batches.json index d2ed3e1..194372b 100644 --- a/data/batches.json +++ b/data/batches.json @@ -63366,4 +63366,4 @@ } ] } -] \ No newline at end of file +] diff --git a/src/format.rs b/src/format.rs index 83c63a1..a556677 100644 --- a/src/format.rs +++ b/src/format.rs @@ -223,6 +223,21 @@ fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String { } let reb = match reb { + Some(reb) if reb.starts_with('#') => { + let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); + if let Some(ent) = ents.iter().find(|ent| { + let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); + ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap() + }) { + let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); + let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + reb.text().unwrap().trim() + } else { + println!("- entry id not found: {}", reb); + ret += &word; + continue; + } + } Some(reb) => reb, None => { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); @@ -333,6 +348,15 @@ fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String { ret += &word_suffix; } + ret += remainder; + + // CHECK + let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap(); + let back_to_ja = re.replace_all(&ret, "").replace("[[", ""); + if ex.ja != back_to_ja { + eprintln!("!!!! {} != {}", ex.ja, back_to_ja); + } + ret } -- cgit v1.2.3