aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-11-27 20:54:17 +0100
committer Alex Auvolat <alex@adnab.me> 2023-11-27 20:54:17 +0100
commit 64568528b13d08ceaa0c36c20b3aa20d966cfdcb (patch)
tree f2f2ac5d6a78b74a1e9ea740215f9a43b2d23364
parent 12690a6afef96aa165f56762689fca682b76f9a0 (diff)
download datagengo-64568528b13d08ceaa0c36c20b3aa20d966cfdcb.tar.gz
datagengo-64568528b13d08ceaa0c36c20b3aa20d966cfdcb.zip
furigana: handle reb as #number_in_jmdict, and check that result at least has correct text under furigana
-rw-r--r-- data/batches.json 2
-rw-r--r-- src/format.rs 24
2 files changed, 25 insertions, 1 deletions
diff --git a/data/batches.json b/data/batches.json
index d2ed3e1..194372b 100644
--- a/data/batches.json
+++ b/data/batches.json
@@ -63366,4 +63366,4 @@
}
]
}
-]
\ No newline at end of file
+]
diff --git a/src/format.rs b/src/format.rs
index 83c63a1..a556677 100644
--- a/src/format.rs
+++ b/src/format.rs
@@ -223,6 +223,21 @@ fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String {
}
let reb = match reb {
+ Some(reb) if reb.starts_with('#') => {
+ let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
+ if let Some(ent) = ents.iter().find(|ent| {
+ let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
+ ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap()
+ }) {
+ let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
+ let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
+ reb.text().unwrap().trim()
+ } else {
+ println!("- entry id not found: {}", reb);
+ ret += &word;
+ continue;
+ }
+ }
Some(reb) => reb,
None => {
let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
@@ -333,6 +348,15 @@ fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String {
ret += &word_suffix;
}
+ ret += remainder;
+
+ // CHECK
+ let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap();
+ let back_to_ja = re.replace_all(&ret, "").replace("[[", "");
+ if ex.ja != back_to_ja {
+ eprintln!("!!!! {} != {}", ex.ja, back_to_ja);
+ }
+
ret
}