diff options
author | Alex Auvolat <alex@adnab.me> | 2023-11-28 16:22:16 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-11-28 16:22:16 +0100 |
commit | b78034ad5bf65f1dfe390861f72bed827e2ab1b8 (patch) | |
tree | aa42b4edc8d2a44bb5c41890089cb754ca7d40d7 /src/format.rs | |
parent | 64568528b13d08ceaa0c36c20b3aa20d966cfdcb (diff) | |
download | datagengo-b78034ad5bf65f1dfe390861f72bed827e2ab1b8.tar.gz datagengo-b78034ad5bf65f1dfe390861f72bed827e2ab1b8.zip |
add furigana to main examples and persist furigana in batches.json
Diffstat (limited to 'src/format.rs')
-rw-r--r-- | src/format.rs | 233 |
1 files changed, 24 insertions, 209 deletions
diff --git a/src/format.rs b/src/format.rs index a556677..caed70a 100644 --- a/src/format.rs +++ b/src/format.rs @@ -1,9 +1,9 @@ -use std::collections::HashSet; use std::fs; use anyhow::Result; use crate::charset::Charset; +use crate::example::expl_clean_word; use crate::*; // ===================================================================== @@ -62,16 +62,26 @@ fn format_batch_aux<'a>( for ex in batch.examples.iter() { writeln!(f, "<hr />")?; - write!(f, r#"<p class="ja">"#)?; - for c in ex.ja.chars() { - if batch.chars.contains(c) { - write!(f, r#"<span class="char_cur">{}</span>"#, c)?; + write!(f, r#"<p class="ja ja_main">"#)?; + let furi = ex.furigana_markup(); + for c in furi.chars() { + let class = if batch.chars.contains(c) { + Some("char_cur") } else if batch.chars_p1.contains(c) { - write!(f, r#"<span class="char_p1">{}</span>"#, c)?; + Some("char_p1") } else if batch.chars_p2.contains(c) { - write!(f, r#"<span class="char_p2">{}</span>"#, c)?; + Some("char_p2") } else if batch.chars_bad.contains(c) { - write!(f, r#"<span class="char_bad">{}</span>"#, c)?; + Some("char_bad") + } else { + None + }; + if let Some(cls) = class { + write!( + f, + r#"<a href="https://jisho.org/search/{}%20%23kanji" class="{}">{}</a>"#, + c, cls, c + )?; } else { write!(f, "{}", c)?; } @@ -140,25 +150,17 @@ fn format_batch_aux<'a>( writeln!( f, - r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"# + r#"<p><strong>Extra examples (reading practice)</strong></p><table class="extratable">"# )?; for ex in batch.extra_examples.iter() { - let furi = format_ex_furigana(dict_idx, ex); - // println!( - // "FURIGANA: {}\n => {}", - // ex.ja, - // format_ex_furigana(dict_idx, ex) - // ); writeln!( f, r#"<tr><td><div class="extra_example"><div class="extra_ja font_ja">{}</div><div class="extra_en">{}</div></div></td></tr>"#, - furi.replace("[[", "<ruby>") - .replace("||", "<rt>") - .replace("]]", "</rt></ruby>"), + ex.furigana_markup(), ex.en )?; } - writeln!(f, r#"</table></details>"#)?; + writeln!(f, r#"</table>"#)?; writeln!(f, "<hr />")?; writeln!(f, "<p>\(≧▽≦)/</p>")?; @@ -172,208 +174,21 @@ fn format_vocab(f: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<()> if !vocab.is_empty() { writeln!( f, - r#"<details><summary>{}</summary><table class="vocabtable">"#, + r#"<p><strong>{}</strong></p><table class="vocabtable">"#, t )?; for v in vocab { writeln!( f, - r#"<tr><td>{}</td><td> <span class="tab_large font_ja">{}</span> </td><td>{}</td><td class="font_ja">{}</td></tr>"#, + r#"<tr><td>{}</td><td style="word-break: keep-all"> <span class="tab_large font_ja">{}</span> </td><td>{}</td><td class="font_ja" style="word-break: keep-all">{}</td></tr>"#, v.level, v.kanji, v.en, v.kana )?; } - writeln!(f, "</table></details>")?; + writeln!(f, "</table>")?; } Ok(()) } -fn format_ex_furigana<'a>(dict_idx: &DictIndex<'a>, ex: &Example) -> String { - use std::fmt::Write; - - let mut remainder = ex.ja.as_str(); - let mut ret = String::new(); - - for word in ex.expl.split(|c| c == ' ' || c == '~') { - let (keb, reb) = expl_clean_word(word); - let word = word - .split_once('{') - .and_then(|(_, r)| r.split_once('}')) - .map(|(p, _)| p) - .unwrap_or(keb); - - if let Some(i) = remainder.find(word) { - ret += &remainder[..i]; - remainder = &remainder[i..]; - } - - let mut new_word = String::new(); - for c in word.chars() { - if remainder.starts_with(c) { - remainder = remainder.strip_prefix(c).unwrap(); - new_word.push(c); - } else { - eprintln!("!!!! Char {} is not in remainder !!!!", c); - } - } - let word = &new_word; - - if !Charset::new(word).intersects(&ex.chars) { - ret += word; - continue; - } - - let reb = match reb { - Some(reb) if reb.starts_with('#') => { - let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); - if let Some(ent) = ents.iter().find(|ent| { - let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); - ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap() - }) { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - reb.text().unwrap().trim() - } else { - println!("- entry id not found: {}", reb); - ret += &word; - continue; - } - } - Some(reb) => reb, - None => { - let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); - let matches = ents - .iter() - .map(|ent| { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - reb.text().unwrap().trim() - }) - .collect::<HashSet<_>>(); - if matches.len() == 1 { - *matches.iter().next().unwrap() - } else { - println!("- word without reb: {}", word); - ret += &word; - continue; - } - } - }; - - //println!("+ word: {}, keb: {}, reb: {}", word, keb, reb); - let common_cnt = word - .chars() - .zip(keb.chars()) - .take_while(|(x, y)| x == y) - .count(); - if common_cnt == 0 { - // Strange cases - write!(&mut ret, "[[{}||{}]]", word, reb).unwrap(); - continue; - } - - let keb_suffix = keb.chars().skip(common_cnt).collect::<String>(); - let word_suffix = word.chars().skip(common_cnt).collect::<String>(); - let reb = reb - .strip_suffix(&keb_suffix) - .or(reb.strip_suffix(&word_suffix)) - .unwrap_or(reb); - //println!(" common reb: {}, common word: {}", reb, word.chars().take(common_cnt).collect::<String>()); - - let wchars = Vec::from_iter(word.chars().take(common_cnt)); - let rchars = Vec::from_iter(reb.chars()); - - // We shall invoke Levhenstein distance - let mut dynrow0 = vec![(0, 0, 0, false)]; - for ri in 0..rchars.len() { - dynrow0.push((0, ri, 100 + ri + 1, false)); - } - let mut dyntab = vec![dynrow0]; - - for (wi, wc) in wchars.iter().enumerate() { - let mut dynrow = vec![(wi, 0, 100 + wi + 1, false)]; - - for (ri, rc) in rchars.iter().enumerate() { - let mut x = vec![]; - if dyntab[wi][ri + 1].3 { - x.push((wi, ri + 1, dyntab[wi][ri + 1].2 + 1, true)); - } - if dynrow[ri].3 { - x.push((wi + 1, ri, dynrow[ri].2 + 1, true)); - } - if wc == rc { - x.push((wi, ri, dyntab[wi][ri].2, false)); - } else { - x.push((wi, ri, dyntab[wi][ri].2 + 1, true)); - } - dynrow.push(x.into_iter().min_by_key(|(_, _, w, _)| *w).unwrap()); - } - dyntab.push(dynrow); - } - //eprintln!("DYN TAB: {:?}", dyntab); - - let mut path = vec![(wchars.len(), rchars.len())]; - loop { - let (wi, ri) = *path.last().unwrap(); - let (wi2, ri2, _, _) = dyntab[wi][ri]; - path.push((wi2, ri2)); - if wi2 == 0 && ri2 == 0 { - break; - } - } - path.reverse(); - //eprintln!("DYN PATH: {:?}", path); - - let mut wbuf = String::new(); - let mut rbuf = String::new(); - for ((wi1, ri1), (wi2, ri2)) in path.iter().copied().zip(path.iter().copied().skip(1)) { - if wi2 > wi1 && ri2 > ri1 && wchars[wi1] == rchars[ri1] { - if !wbuf.is_empty() || !rbuf.is_empty() { - write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap(); - wbuf.clear(); - rbuf.clear(); - } - ret.push(wchars[wi1]); - } else { - if wi2 > wi1 { - wbuf.push(wchars[wi1]); - } - if ri2 > ri1 { - rbuf.push(rchars[ri1]); - } - } - } - if !wbuf.is_empty() || !rbuf.is_empty() { - write!(&mut ret, "[[{}||{}]]", wbuf, rbuf).unwrap(); - } - - ret += &word_suffix; - } - ret += remainder; - - // CHECK - let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap(); - let back_to_ja = re.replace_all(&ret, "").replace("[[", ""); - if ex.ja != back_to_ja { - eprintln!("!!!! {} != {}", ex.ja, back_to_ja); - } - - ret -} - -fn expl_clean_word(w: &str) -> (&str, Option<&str>) { - let mut ret = w; - for delim in ['(', '{', '['] { - if let Some((s, _)) = ret.split_once(delim) { - ret = s; - } - } - let p = w - .split_once('(') - .and_then(|(_, r)| r.split_once(')')) - .map(|(p, _)| p); - (ret, p) -} - fn dict_str_short<'a>( qkeb: &str, qreb: Option<&str>, |