use std::collections::{HashMap, HashSet};
use crate::charset::Charset;
use crate::*;
impl Example {
    /// Generates furigana markup for `self.ja` and stores it in `self.furigana`.
    ///
    /// If `overrides` has an entry keyed by the exact sentence `self.ja`, that
    /// value is stored verbatim and the function returns `true` immediately.
    ///
    /// Otherwise `self.expl` is split on `' '` and `'~'` into word annotations.
    /// Each annotation is decomposed by [`expl_clean_word`] into a dictionary
    /// key (`keb`) and an optional parenthesised part (`reb`); a `{...}` part,
    /// when present, gives the surface form as it appears in the sentence.
    /// The parenthesised part is either a literal reading or `#<id>`, in which
    /// case the reading is taken from the `dict_idx` entry for `keb` whose
    /// `ent_seq` equals `<id>`. With no parenthesised part, the reading is used
    /// only if all `dict_idx` entries for `keb` agree on a single reading.
    ///
    /// The surface form and the reading are then aligned character by
    /// character so that characters common to both are emitted as plain text
    /// and only the differing runs are wrapped as `[[base||reading]]`.
    ///
    /// # Returns
    ///
    /// `true` when an override was used or no problem was detected; `false`
    /// when any annotation could not be matched or resolved, or when stripping
    /// the markup from the result does not give back `self.ja`. In the
    /// non-override case `self.furigana` is set regardless of the return value.
    pub fn gen_furigana(
        &mut self,
        dict_idx: &DictIndex,
        overrides: &HashMap<String, String>,
    ) -> bool {
        use std::fmt::Write;

        if let Some(v) = overrides.get(&self.ja) {
            self.furigana = Some(v.clone());
            return true;
        }

        // Part of the sentence that has not been copied to `ret` yet.
        let mut remainder = self.ja.as_str();
        let mut ret = String::new();
        let mut is_good = true;

        for word in self.expl.split(|c| c == ' ' || c == '~') {
            let (keb, reb) = expl_clean_word(word);
            // Surface form: the `{...}` part if there is one, else the key.
            let word = word
                .split_once('{')
                .and_then(|(_, r)| r.split_once('}'))
                .map(|(p, _)| p)
                .unwrap_or(keb);

            // Copy the text preceding this word unchanged.
            if let Some(i) = remainder.find(word) {
                ret += &remainder[..i];
                remainder = &remainder[i..];
            }

            // Consume the word from the sentence one character at a time,
            // keeping only the characters that are really there.
            let mut new_word = String::new();
            for c in word.chars() {
                if let Some(rest) = remainder.strip_prefix(c) {
                    remainder = rest;
                    new_word.push(c);
                } else {
                    is_good = false;
                    warn!("!!!! Char {} is not in remainder !!!!", c);
                }
            }
            let word = &new_word;

            // Words sharing no character with `self.chars` get no furigana.
            if !Charset::new(word).intersects(&self.chars) {
                ret += word;
                continue;
            }

            let reb = match reb {
                Some(tag) if tag.starts_with('#') => {
                    let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
                    // Parse the id once. A malformed id is reported like an
                    // unknown one instead of aborting the whole run.
                    let wanted = tag.strip_prefix('#').and_then(|id| id.parse::<u64>().ok());
                    let hit = wanted.and_then(|id| ents.iter().find(|ent| ent.ent_seq == id));
                    match hit {
                        Some(ent) => ent.reb.as_str(),
                        None => {
                            is_good = false;
                            warn!("- entry id not found: {}", tag);
                            ret += word;
                            continue;
                        }
                    }
                }
                Some(reb) => reb,
                None => {
                    let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
                    // The set collapses entries that share the same reading.
                    let matches = ents
                        .iter()
                        .map(|ent| ent.reb.as_str())
                        .collect::<HashSet<_>>();
                    if matches.len() == 1 {
                        *matches.iter().next().unwrap()
                    } else {
                        is_good = false;
                        warn!("- word with {} reb: {}", matches.len(), word);
                        ret += word;
                        continue;
                    }
                }
            };

            // Number of leading characters shared by the surface form and the key.
            let common_cnt = word
                .chars()
                .zip(keb.chars())
                .take_while(|(x, y)| x == y)
                .count();
            if common_cnt == 0 {
                // Nothing in common: annotate the whole word with the whole reading.
                write!(ret, "[[{}||{}]]", word, reb).expect("writing to a String cannot fail");
                continue;
            }

            // Drop the differing tail from the reading as well, so that only
            // the shared stem is aligned below.
            let keb_suffix = keb.chars().skip(common_cnt).collect::<String>();
            let word_suffix = word.chars().skip(common_cnt).collect::<String>();
            let reb = reb
                .strip_suffix(&keb_suffix)
                .or_else(|| reb.strip_suffix(&word_suffix))
                .unwrap_or(reb);

            let wchars: Vec<char> = word.chars().take(common_cnt).collect();
            let rchars: Vec<char> = reb.chars().collect();

            // Edit-distance style alignment (Levenshtein-like).
            // Each cell is (previous word index, previous reading index, cost,
            // ends-in-mismatch flag). Border cells carry a cost of 100+.
            let mut dynrow0 = vec![(0, 0, 0, false)];
            for ri in 0..rchars.len() {
                dynrow0.push((0, ri, 100 + ri + 1, false));
            }
            let mut dyntab = vec![dynrow0];
            for (wi, wc) in wchars.iter().enumerate() {
                let mut dynrow = vec![(wi, 0, 100 + wi + 1, false)];
                for (ri, rc) in rchars.iter().enumerate() {
                    // Candidate order matters: `min_by_key` keeps the first
                    // of several equally cheap candidates.
                    let mut candidates = vec![];
                    // Skipping a single character on one side is only allowed
                    // as a continuation of a mismatching run.
                    if dyntab[wi][ri + 1].3 {
                        candidates.push((wi, ri + 1, dyntab[wi][ri + 1].2 + 1, true));
                    }
                    if dynrow[ri].3 {
                        candidates.push((wi + 1, ri, dynrow[ri].2 + 1, true));
                    }
                    if wc == rc {
                        candidates.push((wi, ri, dyntab[wi][ri].2, false));
                    } else {
                        candidates.push((wi, ri, dyntab[wi][ri].2 + 1, true));
                    }
                    dynrow.push(
                        candidates
                            .into_iter()
                            .min_by_key(|(_, _, w, _)| *w)
                            .unwrap(),
                    );
                }
                dyntab.push(dynrow);
            }

            // Walk the back-pointers from the last cell to the origin.
            let mut path = vec![(wchars.len(), rchars.len())];
            loop {
                let (wi, ri) = *path.last().unwrap();
                let (wi2, ri2, _, _) = dyntab[wi][ri];
                path.push((wi2, ri2));
                if wi2 == 0 && ri2 == 0 {
                    break;
                }
            }
            path.reverse();

            // Replay the path: identical characters are emitted as-is, runs of
            // differing characters are collected and emitted as one ruby group.
            let mut wbuf = String::new();
            let mut rbuf = String::new();
            for step in path.windows(2) {
                let (wi1, ri1) = step[0];
                let (wi2, ri2) = step[1];
                if wi2 > wi1 && ri2 > ri1 && wchars[wi1] == rchars[ri1] {
                    if !wbuf.is_empty() || !rbuf.is_empty() {
                        write!(ret, "[[{}||{}]]", wbuf, rbuf)
                            .expect("writing to a String cannot fail");
                        wbuf.clear();
                        rbuf.clear();
                    }
                    ret.push(wchars[wi1]);
                } else {
                    if wi2 > wi1 {
                        wbuf.push(wchars[wi1]);
                    }
                    if ri2 > ri1 {
                        rbuf.push(rchars[ri1]);
                    }
                }
            }
            if !wbuf.is_empty() || !rbuf.is_empty() {
                write!(ret, "[[{}||{}]]", wbuf, rbuf).expect("writing to a String cannot fail");
            }
            ret += &word_suffix;
        }
        ret += remainder;

        // Sanity check: removing the markup must give back the original sentence.
        let re = regex::Regex::new(r#"\|\|\w+\]\]"#).unwrap();
        let back_to_ja = re.replace_all(&ret, "").replace("[[", "");
        if self.ja != back_to_ja {
            is_good = false;
            error!("!!!! {} != {}", self.ja, back_to_ja);
        }

        self.furigana = Some(ret);
        is_good
    }

    /// Renders the stored furigana as HTML ruby markup.
    ///
    /// `[[`, `||` and `]]` are replaced by `<ruby>`, `<rt>` and `</rt></ruby>`
    /// respectively. When no furigana has been stored, the plain sentence
    /// `self.ja` is returned. The text is not HTML-escaped.
    pub fn furigana_markup(&self) -> String {
        if let Some(furi) = &self.furigana {
            furi.replace("[[", "<ruby>")
                .replace("||", "<rt>")
                .replace("]]", "</rt></ruby>")
        } else {
            self.ja.to_string()
        }
    }
}
/// Splits one word annotation into its leading key and its parenthesised part.
///
/// The first element of the result is the prefix of `w` that precedes the
/// earliest `(`, `{` or `[` (all of `w` when none of them occurs). The second
/// element is the text between the first `(` and the first `)` following it,
/// or `None` when `w` has no such pair.
pub fn expl_clean_word(w: &str) -> (&str, Option<&str>) {
    // Cutting at `(`, then `{`, then `[` one after another always ends at the
    // earliest of the three, so a single search is enough.
    let cut = w
        .find(|c: char| matches!(c, '(' | '{' | '['))
        .unwrap_or(w.len());
    let head = &w[..cut];

    let inside_parens = w.find('(').and_then(|open| {
        // `(` is a single byte in UTF-8, so `open + 1` is a char boundary.
        let after = &w[open + 1..];
        after.find(')').map(|close| &after[..close])
    });

    (head, inside_parens)
}