diff options
author | Alex Auvolat <alex@adnab.me> | 2023-11-27 20:54:17 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-11-27 20:54:17 +0100 |
commit | 64568528b13d08ceaa0c36c20b3aa20d966cfdcb (patch) | |
tree | f2f2ac5d6a78b74a1e9ea740215f9a43b2d23364 /src/format.rs | |
parent | 12690a6afef96aa165f56762689fca682b76f9a0 (diff) | |
download | datagengo-64568528b13d08ceaa0c36c20b3aa20d966cfdcb.tar.gz datagengo-64568528b13d08ceaa0c36c20b3aa20d966cfdcb.zip |
furigana: handle a reb given as #number_in_jmdict (the entry's ent_seq number), and check that the result, with the furigana markup stripped, at least matches the original text
Diffstat (limited to 'src/format.rs')
-rw-r--r-- | src/format.rs | 24 |
1 files changed, 24 insertions, 0 deletions
diff --git a/src/format.rs b/src/format.rs index 83c63a1..a556677 100644 --- a/src/format.rs +++ b/src/format.rs @@ -223,6 +223,21 @@ fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String { } let reb = match reb { + Some(reb) if reb.starts_with('#') => { + let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); + if let Some(ent) = ents.iter().find(|ent| { + let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); + ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap() + }) { + let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); + let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + reb.text().unwrap().trim() + } else { + println!("- entry id not found: {}", reb); + ret += &word; + continue; + } + } Some(reb) => reb, None => { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); @@ -333,6 +348,15 @@ fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String { ret += &word_suffix; } + ret += remainder; + + // CHECK + let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap(); + let back_to_ja = re.replace_all(&ret, "").replace("[[", ""); + if ex.ja != back_to_ja { + eprintln!("!!!! {} != {}", ex.ja, back_to_ja); + } + ret } |