use std::collections::HashMap;
use std::fs;
use std::io::{self, BufRead};

use anyhow::Result;
use serde::{Deserialize, Serialize};

use crate::charset::Charset;

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Example {
    pub ja: String,
    pub en: String,
    pub expl: String,
    #[serde(default)]
    pub furigana: Option<String>,
    pub id: Option<String>,
    pub chars: Charset,
}

// =====================================================================
// PARSING DATA FILES
// =====================================================================

pub type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;

/// Indexes JMdict `<entry>` nodes by the text of their `<keb>`
/// (kanji spelling) elements.
pub fn index_jmdict<'a>(dict: &'a roxmltree::Document<'a>) -> DictIndex<'a> {
    let dict = dict
        .root()
        .children()
        .find(|x| x.has_tag_name("JMdict"))
        .unwrap();

    let mut ret: DictIndex<'a> = HashMap::new();
    for x in dict.children().filter(|x| x.has_tag_name("entry")) {
        for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
            if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
                let txt = keb.text().unwrap().trim();
                ret.entry(txt).or_default().push(x);
            }
        }
    }

    ret
}

/// Groups the kanji of `data/kanjidic2.xml` into ordered difficulty buckets
/// keyed by (JLPT level, school grade), each bucket disjoint from the ones
/// before it.
pub fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
    let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());

    let file = fs::read_to_string("data/kanjidic2.xml")?;
    let xml = roxmltree::Document::parse(&file)?;
    let kanjidic = xml.root().first_child().unwrap();
    assert!(kanjidic.has_tag_name("kanjidic2"));

    let mut levels = HashMap::new();

    for x in kanjidic.children() {
        if !x.has_tag_name("character") {
            continue;
        }

        let mut literal = None;
        let mut jlpt = None;
        let mut grade = None;
        for y in x.children() {
            if y.has_tag_name("literal") {
                literal = y.text();
            }
            if y.has_tag_name("misc") {
                for z in y.children() {
                    if z.has_tag_name("jlpt") {
                        jlpt = z.text().and_then(|x| str::parse::<i32>(x).ok());
                    }
                    if z.has_tag_name("grade") {
                        grade = z.text().and_then(|x| str::parse::<i32>(x).ok());
                    }
                }
            }
        }

        // Collapse all primary-school grades (1..=6) into a single bucket.
        match grade {
            Some(i) if i <= 6 => grade = Some(7),
            _ => (),
        }

        if let Some(lit) = literal {
            assert_eq!(lit.chars().count(), 1);
            // Kanjidic uses the old four-level JLPT scale; remap it to the
            // current five-level scale, splitting the old level 2 into N2
            // and N3 using the separate N3 kanji list.
            let jlpt = match jlpt {
                Some(4) => Some(5),
                Some(3) => Some(4),
                Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3),
                x => x,
            };
            levels
                .entry((jlpt, grade))
                .or_insert(String::new())
                .extend(lit.chars());
        }
    }

    let mut levels = levels.into_iter().collect::<Vec<_>>();
    levels.sort_by_key(|((j, g), _)| match (j, g) {
        (Some(j), Some(g)) => (10 - *j) * 20 + *g,
        (Some(j), None) => (10 - *j) * 20 + 15,
        (None, Some(g)) => 1000 + *g,
        (None, None) => 1015,
    });

    let mut ret = Vec::new();
    let mut pc = Charset::default();
    for ((j, g), chars) in levels.into_iter() {
        let name = match (j, g) {
            (Some(j), Some(7)) => format!("N{}a", j),
            (Some(j), Some(8)) => format!("N{}b", j),
            (Some(j), Some(g)) => format!("N{}-{}", j, g),
            (Some(j), None) => format!("N{}+", j),
            (None, Some(7)) => "N0a".to_string(),
            (None, Some(8)) => "N0b".to_string(),
            (None, Some(g)) => format!("N0-{}", g),
            (None, None) => "N0+".to_string(),
        };
        // Keep each bucket disjoint from all previously emitted ones.
        let chars = Charset::new(chars).diff(&pc);
        pc = pc.union(&chars);
        ret.push((name, chars));
    }

    Ok(ret)
}
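// A minimal sketch of how `index_jmdict` is meant to be used, exercised
// against a hand-written XML fragment rather than the real JMdict file;
// the fragment and the test module itself are invented for illustration.
#[cfg(test)]
mod jmdict_index_tests {
    use super::*;

    #[test]
    fn indexes_entries_by_keb_text() {
        let xml = r#"<JMdict><entry><k_ele><keb>日本</keb></k_ele></entry></JMdict>"#;
        let doc = roxmltree::Document::parse(xml).unwrap();
        let idx = index_jmdict(&doc);
        // The single entry is keyed by its kanji spelling.
        assert!(idx.contains_key("日本"));
        assert_eq!(idx["日本"].len(), 1);
    }
}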
/// Reads `level: kanji` lines from `data/kanji_levels.txt`.
pub fn read_kanji_levels() -> Result<Vec<(String, String)>> {
    Ok(fs::read_to_string("data/kanji_levels.txt")?
        .lines()
        .filter_map(|l| l.split_once(": "))
        .map(|(l, k)| (l.to_string(), k.to_string()))
        .collect::<Vec<_>>())
}

/// Reads example sentences from `data/examples.utf`, where each example is
/// an `A:` line (`japanese<TAB>english#id`) followed by a `B:` line with the
/// reading annotations.
pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
    let file = fs::File::open("data/examples.utf")?;

    let mut ret = Vec::new();
    let mut a = "".to_string();

    for (i, line) in io::BufReader::new(file).lines().enumerate() {
        let line = line?;
        if line.starts_with("A:") {
            a = line;
        } else if line.starts_with("B:") {
            let s = a.strip_prefix("A: ");
            let t = line.strip_prefix("B: ");
            if let (Some(a), Some(b)) = (s, t) {
                if let Some((ja, eng)) = a.split_once("\t") {
                    if let Some((eng, id)) = eng.split_once("#") {
                        ret.push(Example {
                            ja: ja.to_string(),
                            en: eng.to_string(),
                            expl: b.to_string(),
                            id: Some(id.to_string()),
                            chars: Charset::new(ja).inter(all_kanji),
                            furigana: None,
                        });
                    } else {
                        ret.push(Example {
                            ja: ja.to_string(),
                            en: eng.to_string(),
                            expl: b.to_string(),
                            id: None,
                            chars: Charset::new(ja).inter(all_kanji),
                            furigana: None,
                        });
                    }
                }
            }
        }
        if i % 10000 == 0 {
            eprintln!("read examples: {}/300 (x1000)", i / 1000);
        }
    }

    Ok(ret)
}

pub fn read_furigana_overrides() -> Result<HashMap<String, String>> {
    let file = fs::File::open("data/furigana_overrides")?;
    let mut ret = HashMap::new();
    let re = regex::Regex::new(r#"\|\|\w+\]\]"#)?;
    for line in io::BufReader::new(file).lines() {
        let line = line?;
        let line = line.trim();
        if !line.is_empty() {
            // Strip the `[[word||reading]]` markup to recover the plain
            // sentence, which becomes the lookup key for the override.
            let clean = re.replace_all(line, "").replace("[[", "");
            if clean != line {
                ret.insert(clean, line.to_string());
            }
        }
    }
    Ok(ret)
}

#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct JlptVocab {
    pub level: String,
    pub chars: Charset,
    pub kanji: String,
    pub kana: String,
    pub en: String,
}

impl JlptVocab {
    pub fn to_string(&self) -> String {
        format!(
            "{}\t{}\t{}\t{}\t{}",
            self.level,
            self.chars.to_string(),
            self.kanji,
            self.kana,
            self.en
        )
    }
}

/// Parses all per-level vocab lists and prints them to stdout in the
/// tab-separated format read back by `load_jlpt_vocab`.
pub fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
    let mut vocab = vec![];
    vocab.extend(parse_jlpt_vocab_combined(
        "data/n5_vocab.txt",
        "N5",
        all_kanji,
    )?);
    vocab.extend(parse_jlpt_vocab_split(
        "data/n4_vocab_hiragana.txt",
        "data/n4_vocab_eng.txt",
        "N4",
        all_kanji,
    )?);
    vocab.extend(parse_jlpt_vocab_split(
        "data/n3_vocab_hiragana.txt",
        "data/n3_vocab_eng.txt",
        "N3",
        all_kanji,
    )?);
    vocab.extend(parse_jlpt_vocab_split(
        "data/n2_vocab_hiragana.txt",
        "data/n2_vocab_eng.txt",
        "N2",
        all_kanji,
    )?);
    vocab.extend(parse_jlpt_vocab_split(
        "data/n1_vocab_hiragana.txt",
        "data/n1_vocab_eng.txt",
        "N1",
        all_kanji,
    )?);
    for v in vocab.iter() {
        println!("{}", v.to_string());
    }
    Ok(())
}

fn parse_jlpt_vocab_combined(
    file: &str,
    level: &str,
    all_kanji: &Charset,
) -> Result<Vec<JlptVocab>> {
    let lines = jlpt_vocab_read_file(file)?;
    let mut ret = vec![];
    for (kanji, answer) in lines {
        // The answer column holds the English meaning, optionally followed
        // by the kana reading on a second (embedded) line.
        let (eng, kana) = match answer.split_once('\n') {
            Some((a, b)) => (a, b.trim()),
            None => (answer.trim(), ""),
        };
        for kanji in kanji.split('/') {
            ret.push(JlptVocab {
                level: level.to_string(),
                chars: Charset::new(kanji).inter(all_kanji),
                kanji: kanji.to_string(),
                kana: kana.to_string(),
                en: eng.to_string(),
            });
        }
    }
    Ok(ret)
}

fn parse_jlpt_vocab_split(
    kana_file: &str,
    eng_file: &str,
    level: &str,
    all_kanji: &Charset,
) -> Result<Vec<JlptVocab>> {
    let eng_lines = jlpt_vocab_read_file(eng_file)?
        .into_iter()
        .collect::<HashMap<_, _>>();

    let lines = jlpt_vocab_read_file(kana_file)?;
    let mut ret = vec![];
    for (kanji, kana) in lines {
        // Meanings are keyed by the kanji spelling, falling back to the kana.
        let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana));
        if let Some(eng) = eng {
            for kanji in kanji.split('/') {
                ret.push(JlptVocab {
                    level: level.to_string(),
                    chars: Charset::new(kanji).inter(all_kanji),
                    kanji: kanji.to_string(),
                    kana: kana.to_string(),
                    en: eng.to_string(),
                });
            }
        }
    }
    Ok(ret)
}
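// A sketch of the split-file format consumed above: the kana file holds
// `kanji|kana` lines and the English file holds `kanji|meaning` lines,
// joined on the kanji (or kana) column. The file names and contents below
// are invented test fixtures, not shipped data files.
#[cfg(test)]
mod vocab_split_tests {
    use super::*;

    #[test]
    fn joins_kana_and_english_files() -> Result<()> {
        let dir = std::env::temp_dir();
        let kana_path = dir.join("test_n5_kana.txt");
        let eng_path = dir.join("test_n5_eng.txt");
        fs::write(&kana_path, "水|みず\n")?;
        fs::write(&eng_path, "水|water\n")?;

        let all_kanji = Charset::new("水");
        let vocab = parse_jlpt_vocab_split(
            kana_path.to_str().unwrap(),
            eng_path.to_str().unwrap(),
            "N5",
            &all_kanji,
        )?;

        assert_eq!(vocab.len(), 1);
        assert_eq!(vocab[0].kana, "みず");
        assert_eq!(vocab[0].en, "water");
        Ok(())
    }
}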
fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
    // The vocab lists are scraped HTML tables: turn line-break tags into
    // real newlines, then strip any remaining tags. (The exact tag literals
    // were lost when this source was copied through an HTML pipeline;
    // "<br/>" and "<br>" are a best guess.)
    let re = regex::Regex::new(r#"<[^>]*>"#)?;
    let file = fs::File::open(file)?;
    let mut ret = vec![];
    for line in io::BufReader::new(file).lines() {
        let line = line?.replace("<br/>", "\n").replace("<br>", "");
        let line = re.replace_all(&line, "");
        if let Some((a, b)) = line.split_once('|') {
            ret.push((a.trim().to_string(), b.trim().to_string()));
        }
    }
    Ok(ret)
}

/// Loads the vocab list previously written by `parse_jlpt_vocab`.
pub fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> {
    let file = fs::File::open("data/jlpt_vocab.txt")?;
    let mut ret = vec![];
    for line in io::BufReader::new(file).lines() {
        let line = line?;
        let line = line.splitn(5, "\t").collect::<Vec<_>>();
        if line.len() == 5 {
            ret.push(JlptVocab {
                level: line[0].to_string(),
                chars: Charset::new(line[1]),
                kanji: line[2].to_string(),
                kana: line[3].to_string(),
                en: line[4].to_string(),
            });
        }
    }
    Ok(ret)
}