diff options
author | Alex Auvolat <alex@adnab.me> | 2023-11-27 20:03:08 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-11-27 20:03:08 +0100 |
commit | 061d9d3cddbd2673e4601c335b362bb1435b59b8 (patch) | |
tree | f3f1e21afb368e177cb0e333802bef40a12541ab /src/format.rs | |
parent | d2a46c25219c21ac4f128da8512302935654d38e (diff) | |
download | datagengo-061d9d3cddbd2673e4601c335b362bb1435b59b8.tar.gz datagengo-061d9d3cddbd2673e4601c335b362bb1435b59b8.zip |
use furigana for extra examples
Diffstat (limited to 'src/format.rs')
-rw-r--r-- | src/format.rs | 181 |
1 files changed, 157 insertions, 24 deletions
diff --git a/src/format.rs b/src/format.rs index 1cdde1b..88c81c3 100644 --- a/src/format.rs +++ b/src/format.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::fs; use anyhow::Result; @@ -142,32 +143,19 @@ fn format_batch_aux<'a>( r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"# )?; for ex in batch.extra_examples.iter() { - let mut expl1 = Vec::new(); - let mut expl2 = Vec::new(); - for word in ex.expl.split(|c| c == ' ' || c == '~') { - let (keb, reb) = expl_clean_word(word); - let wchars = Charset::new(keb); - if !wchars.intersects(&ex.chars) { - continue; - } - if let Some(ents) = dict_idx.get(keb) { - for ent in ents.iter() { - if let Some(s) = dict_str_short(keb, reb, ent) { - if wchars.intersects(&batch.chars) { - expl1.push(s); - } else { - expl2.push(s); - } - } - } - } - } - expl1.extend(expl2.into_iter()); - let expl = expl1.join("<br />"); + let furi = format_ex_furigana(dict_idx, ex); + // println!( + // "FURIGANA: {}\n => {}", + // ex.ja, + // format_ex_furigana(dict_idx, ex) + // ); writeln!( f, - r#"<tr><td><details><summary class="tab_large2 font_ja"> {} </summary><div style="text-align: center">{}<br />{}</div></details></td></tr>"#, - ex.ja, ex.en, expl + r#"<tr><td><div class="extra_example"><div class="extra_ja font_ja">{}</div><div class="extra_en">{}</div></div></td></tr>"#, + furi.replace("[[", "<ruby>") + .replace("||", "<rt>") + .replace("]]", "</rt></ruby>"), + ex.en )?; } writeln!(f, r#"</table></details>"#)?; @@ -199,6 +187,151 @@ fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> Ok(()) } +fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String { + use std::fmt::Write; + + let mut remainder = ex.ja.as_str(); + let mut ret = String::new(); + + for word in ex.expl.split(|c| c == ' ' || c == '~') { + let (keb, reb) = expl_clean_word(word); + let word = word + .split_once('{') + .and_then(|(_, r)| r.split_once('}')) + .map(|(p, _)| p) + .unwrap_or(keb); + + if let Some(i) = remainder.find(word) { + ret += &remainder[..i]; + remainder = &remainder[i..]; + } + + let mut new_word = String::new(); + for c in word.chars() { + if remainder.starts_with(c) { + remainder = remainder.strip_prefix(c).unwrap(); + new_word.push(c); + } else { + eprintln!("!!!! Char {} is not in remainder !!!!", c); + } + } + let word = &new_word; + + if !Charset::new(word).intersects(&ex.chars) { + ret += word; + continue; + } + + let reb = match reb { + Some(reb) => reb, + None => { + let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); + let matches = ents + .iter() + .map(|ent| { + let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); + let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + reb.text().unwrap().trim() + }) + .collect::<HashSet<_>>(); + if matches.len() == 1 { + *matches.iter().next().unwrap() + } else { + println!("- word without reb: {}", word); + ret += &word; + continue; + } + } + }; + + //println!("+ word: {}, keb: {}, reb: {}", word, keb, reb); + let common_cnt = word + .chars() + .zip(keb.chars()) + .take_while(|(x, y)| x == y) + .count(); + if common_cnt == 0 { + // Strange cases + write!(&mut ret, "[[{}||{}]]", word, reb).unwrap(); + continue; + } + + let keb_suffix = keb.chars().skip(common_cnt).collect::<String>(); + let reb = reb.strip_suffix(&keb_suffix).unwrap_or(reb); + //println!(" >> common reb: {}, common_word: {}", reb, word.chars().take(common_cnt).collect::<String>()); + + let wchars = Vec::from_iter(word.chars().take(common_cnt)); + let rchars = Vec::from_iter(reb.chars()); + + // We shall invoke Levhenstein distance + let mut dynrow0 = vec![(0, 0, 0, false)]; + for ri in 0..rchars.len() { + dynrow0.push((0, ri, 100 + ri + 1, false)); + } + let mut dyntab = vec![dynrow0]; + + for (wi, wc) in wchars.iter().enumerate() { + let mut dynrow = vec![(wi, 0, 100 + wi + 1, false)]; + + for (ri, rc) in rchars.iter().enumerate() { + let mut x = vec![]; + if dyntab[wi][ri + 1].3 { + x.push((wi, ri + 1, dyntab[wi][ri + 1].2 + 1, true)); + } + if dynrow[ri].3 { + x.push((wi + 1, ri, dynrow[ri].2 + 1, true)); + } + if wc == rc { + x.push((wi, ri, dyntab[wi][ri].2, false)); + } else { + x.push((wi, ri, dyntab[wi][ri].2 + 1, true)); + } + dynrow.push(x.into_iter().min_by_key(|(_, _, w, _)| *w).unwrap()); + } + dyntab.push(dynrow); + } + //eprintln!("DYN TAB: {:?}", dyntab); + + let mut path = vec![(wchars.len(), rchars.len())]; + loop { + let (wi, ri) = *path.last().unwrap(); + let (wi2, ri2, _, _) = dyntab[wi][ri]; + path.push((wi2, ri2)); + if wi2 == 0 && ri2 == 0 { + break; + } + } + path.reverse(); + //eprintln!("DYN PATH: {:?}", path); + + let mut wbuf = String::new(); + let mut rbuf = String::new(); + for ((wi1, ri1), (wi2, ri2)) in path.iter().copied().zip(path.iter().copied().skip(1)) { + if wi2 > wi1 && ri2 > ri1 && wchars[wi1] == rchars[ri1] { + if !wbuf.is_empty() || !rbuf.is_empty() { + write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap(); + wbuf.clear(); + rbuf.clear(); + } + ret.push(wchars[wi1]); + } else { + if wi2 > wi1 { + wbuf.push(wchars[wi1]); + } + if ri2 > ri1 { + rbuf.push(rchars[ri1]); + } + } + } + if !wbuf.is_empty() || !rbuf.is_empty() { + write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap(); + } + + ret.extend(word.chars().skip(common_cnt)); + } + ret +} + fn expl_clean_word(w: &str) -> (&str, Option<&str>) { let mut ret = w; for delim in ['(', '{', '['] { |