aboutsummaryrefslogtreecommitdiff
path: root/src/format.rs
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2023-11-27 20:03:08 +0100
committerAlex Auvolat <alex@adnab.me>2023-11-27 20:03:08 +0100
commit061d9d3cddbd2673e4601c335b362bb1435b59b8 (patch)
treef3f1e21afb368e177cb0e333802bef40a12541ab /src/format.rs
parentd2a46c25219c21ac4f128da8512302935654d38e (diff)
downloaddatagengo-061d9d3cddbd2673e4601c335b362bb1435b59b8.tar.gz
datagengo-061d9d3cddbd2673e4601c335b362bb1435b59b8.zip
use furigana for extra examples
Diffstat (limited to 'src/format.rs')
-rw-r--r--src/format.rs181
1 files changed, 157 insertions, 24 deletions
diff --git a/src/format.rs b/src/format.rs
index 1cdde1b..88c81c3 100644
--- a/src/format.rs
+++ b/src/format.rs
@@ -1,3 +1,4 @@
+use std::collections::HashSet;
use std::fs;
use anyhow::Result;
@@ -142,32 +143,19 @@ fn format_batch_aux<'a>(
r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"#
)?;
for ex in batch.extra_examples.iter() {
- let mut expl1 = Vec::new();
- let mut expl2 = Vec::new();
- for word in ex.expl.split(|c| c == ' ' || c == '~') {
- let (keb, reb) = expl_clean_word(word);
- let wchars = Charset::new(keb);
- if !wchars.intersects(&ex.chars) {
- continue;
- }
- if let Some(ents) = dict_idx.get(keb) {
- for ent in ents.iter() {
- if let Some(s) = dict_str_short(keb, reb, ent) {
- if wchars.intersects(&batch.chars) {
- expl1.push(s);
- } else {
- expl2.push(s);
- }
- }
- }
- }
- }
- expl1.extend(expl2.into_iter());
- let expl = expl1.join("<br />");
+ let furi = format_ex_furigana(dict_idx, ex);
+ // println!(
+ // "FURIGANA: {}\n => {}",
+ // ex.ja,
+ // format_ex_furigana(dict_idx, ex)
+ // );
writeln!(
f,
- r#"<tr><td><details><summary class="tab_large2 font_ja">&nbsp;&nbsp;{}&nbsp;&nbsp;</summary><div style="text-align: center">{}<br />{}</div></details></td></tr>"#,
- ex.ja, ex.en, expl
+ r#"<tr><td><div class="extra_example"><div class="extra_ja font_ja">{}</div><div class="extra_en">{}</div></div></td></tr>"#,
+ furi.replace("[[", "<ruby>")
+ .replace("||", "<rt>")
+ .replace("]]", "</rt></ruby>"),
+ ex.en
)?;
}
writeln!(f, r#"</table></details>"#)?;
@@ -199,6 +187,151 @@ fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()>
Ok(())
}
+fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String {
+ use std::fmt::Write; // needed for write! into the String `ret` below
+ // Rebuilds ex.ja with furigana markers: walks the words of ex.expl in order, copies unannotated text as-is and emits [[base||reading]] for the parts that get a reading.
+ let mut remainder = ex.ja.as_str(); // part of the sentence not consumed yet
+ let mut ret = String::new(); // output built so far
+ // ex.expl is split on ' ' and '~'; each piece is handled as one word.
+ for word in ex.expl.split(|c| c == ' ' || c == '~') {
+ let (keb, reb) = expl_clean_word(word); // helper defined elsewhere; below, keb is the dict_idx lookup key and reb an optional reading
+ let word = word // prefer the text between '{' and '}' in the raw word, else fall back to keb
+ .split_once('{')
+ .and_then(|(_, r)| r.split_once('}'))
+ .map(|(p, _)| p)
+ .unwrap_or(keb);
+ // Copy the text that precedes this word in the sentence to the output unchanged (nothing is skipped if the word is not found).
+ if let Some(i) = remainder.find(word) {
+ ret += &remainder[..i];
+ remainder = &remainder[i..];
+ }
+ // Consume the word from remainder char by char; chars that do not match the front of remainder are reported on stderr and left out of new_word.
+ let mut new_word = String::new();
+ for c in word.chars() {
+ if remainder.starts_with(c) {
+ remainder = remainder.strip_prefix(c).unwrap(); // cannot fail: starts_with(c) was just checked
+ new_word.push(c);
+ } else {
+ eprintln!("!!!! Char {} is not in remainder !!!!", c);
+ }
+ }
+ let word = &new_word; // from here on, word is only the part actually consumed from the sentence
+ // If the word's Charset does not intersect ex.chars, emit it without any annotation.
+ if !Charset::new(word).intersects(&ex.chars) {
+ ret += word;
+ continue;
+ }
+ // Pick the reading: the one returned by expl_clean_word, else the single distinct reading found among the dictionary entries for keb.
+ let reb = match reb {
+ Some(reb) => reb,
+ None => {
+ let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); // no entry for keb: empty slice
+ let matches = ents
+ .iter()
+ .map(|ent| {
+ let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); // first r_ele child; panics if the entry has none
+ let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); // first reb child; panics if there is none
+ reb.text().unwrap().trim() // panics if reb has no text
+ })
+ .collect::<HashSet<_>>(); // a set, so identical readings from several entries count once
+ if matches.len() == 1 {
+ *matches.iter().next().unwrap()
+ } else { // zero or several distinct readings: give up and emit the word as-is
+ println!("- word without reb: {}", word); // NOTE(review): printed on stdout, while the mismatch report above uses stderr
+ ret += &word;
+ continue;
+ }
+ }
+ };
+ // Count how many leading chars word and keb share; only that prefix is aligned against the reading.
+ //println!("+ word: {}, keb: {}, reb: {}", word, keb, reb);
+ let common_cnt = word
+ .chars()
+ .zip(keb.chars())
+ .take_while(|(x, y)| x == y)
+ .count();
+ if common_cnt == 0 {
+ // Strange cases: word and keb share no leading char, so annotate the whole word with the full reading
+ write!(&mut ret, "[[{}||{}]]", word, reb).unwrap();
+ continue;
+ }
+ // Drop from the reading the suffix that keb has beyond the common prefix (only when the reading actually ends with it).
+ let keb_suffix = keb.chars().skip(common_cnt).collect::<String>();
+ let reb = reb.strip_suffix(&keb_suffix).unwrap_or(reb);
+ //println!(" >> common reb: {}, common_word: {}", reb, word.chars().take(common_cnt).collect::<String>());
+
+ let wchars = Vec::from_iter(word.chars().take(common_cnt)); // chars of the common prefix of the word
+ let rchars = Vec::from_iter(reb.chars()); // chars of the (trimmed) reading
+ // Each table cell is (previous wi, previous ri, cost, flag); flag is false on the borders and after a matching diagonal step, true otherwise.
+ // We shall invoke Levenshtein distance: an edit-distance style table aligning wchars (rows) with rchars (columns)
+ let mut dynrow0 = vec![(0, 0, 0, false)];
+ for ri in 0..rchars.len() {
+ dynrow0.push((0, ri, 100 + ri + 1, false)); // first row: predecessor is the cell to the left; the 100 offset presumably keeps paths off the border — TODO confirm
+ }
+ let mut dyntab = vec![dynrow0];
+ // Fill one row per char of the word prefix.
+ for (wi, wc) in wchars.iter().enumerate() {
+ let mut dynrow = vec![(wi, 0, 100 + wi + 1, false)]; // first column: predecessor is the cell above
+ // For cell (wi + 1, ri + 1), gather the candidate predecessors; min_by_key below keeps the first one with the lowest cost, so push order matters.
+ for (ri, rc) in rchars.iter().enumerate() {
+ let mut x = vec![];
+ if dyntab[wi][ri + 1].3 { // step from the cell above, only allowed when that cell is flagged
+ x.push((wi, ri + 1, dyntab[wi][ri + 1].2 + 1, true));
+ }
+ if dynrow[ri].3 { // step from the cell to the left, only allowed when that cell is flagged
+ x.push((wi + 1, ri, dynrow[ri].2 + 1, true));
+ }
+ if wc == rc { // diagonal step: equal chars cost nothing and clear the flag
+ x.push((wi, ri, dyntab[wi][ri].2, false));
+ } else { // diagonal step on different chars: cost + 1 and set the flag
+ x.push((wi, ri, dyntab[wi][ri].2 + 1, true));
+ }
+ dynrow.push(x.into_iter().min_by_key(|(_, _, w, _)| *w).unwrap()); // x always holds the diagonal candidate, so unwrap cannot fail
+ }
+ dyntab.push(dynrow);
+ }
+ //eprintln!("DYN TAB: {:?}", dyntab);
+ // Walk the stored predecessors back from the bottom-right cell to (0, 0), then reverse to get the path in forward order.
+ let mut path = vec![(wchars.len(), rchars.len())];
+ loop {
+ let (wi, ri) = *path.last().unwrap();
+ let (wi2, ri2, _, _) = dyntab[wi][ri];
+ path.push((wi2, ri2));
+ if wi2 == 0 && ri2 == 0 {
+ break;
+ }
+ }
+ path.reverse();
+ //eprintln!("DYN PATH: {:?}", path);
+ // Replay the path: matching chars go to the output as-is, everything else is buffered (wbuf: word chars, rbuf: reading chars) and flushed as one [[wbuf||rbuf]] group.
+ let mut wbuf = String::new();
+ let mut rbuf = String::new();
+ for ((wi1, ri1), (wi2, ri2)) in path.iter().copied().zip(path.iter().copied().skip(1)) {
+ if wi2 > wi1 && ri2 > ri1 && wchars[wi1] == rchars[ri1] { // diagonal step over identical chars
+ if !wbuf.is_empty() || !rbuf.is_empty() {
+ write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
+ wbuf.clear();
+ rbuf.clear();
+ }
+ ret.push(wchars[wi1]);
+ } else {
+ if wi2 > wi1 { // the step advanced in the word
+ wbuf.push(wchars[wi1]);
+ }
+ if ri2 > ri1 { // the step advanced in the reading
+ rbuf.push(rchars[ri1]);
+ }
+ }
+ }
+ if !wbuf.is_empty() || !rbuf.is_empty() { // flush the last pending group
+ write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap();
+ }
+ // Append the rest of the word (past the common prefix) without annotation.
+ ret.extend(word.chars().skip(common_cnt));
+ }
+ ret // NOTE(review): text still left in remainder after the last word is never appended to ret — confirm this is intended
+}
+
fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
let mut ret = w;
for delim in ['(', '{', '['] {