diff options
author | Alex Auvolat <alex@adnab.me> | 2023-11-27 17:26:59 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-11-27 17:26:59 +0100 |
commit | d2a46c25219c21ac4f128da8512302935654d38e (patch) | |
tree | a6d66ac4639e4d68fe57f9e8da72b08ecfb14d9f /src/datafiles.rs | |
parent | b15723f33b486124a50408873d30998bb9d31b3b (diff) | |
download | datagengo-d2a46c25219c21ac4f128da8512302935654d38e.tar.gz datagengo-d2a46c25219c21ac4f128da8512302935654d38e.zip |
split code into several files
Diffstat (limited to 'src/datafiles.rs')
-rw-r--r-- | src/datafiles.rs | 321 |
1 file changed, 321 insertions, 0 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs new file mode 100644 index 0000000..629badf --- /dev/null +++ b/src/datafiles.rs @@ -0,0 +1,321 @@ +use std::collections::HashMap; +use std::fs; +use std::io::{self, BufRead}; + +use anyhow::Result; +use serde::{Deserialize, Serialize}; + +use crate::charset::Charset; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct Example { + pub ja: String, + pub en: String, + pub expl: String, + pub id: Option<String>, + pub chars: Charset, +} + +// ===================================================================== +// PARSING DATA FILES +// ===================================================================== + +pub type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>; +pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { + let dict = dict + .root() + .children() + .find(|x| x.has_tag_name("JMdict")) + .unwrap(); + + let mut ret: DictIndex<'a> = HashMap::new(); + for x in dict.children().filter(|x| x.has_tag_name("entry")) { + for r in x.children().filter(|x| x.has_tag_name("k_ele")) { + if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { + let txt = keb.text().unwrap().trim(); + ret.entry(txt).or_default().push(x); + } + } + } + + ret +} + +pub fn parse_kanjidic() -> Result<Vec<(String, Charset)>> { + let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim()); + + let file = fs::read_to_string("data/kanjidic2.xml")?; + let xml = roxmltree::Document::parse(&file)?; + let kanjidic = xml.root().first_child().unwrap(); + assert!(kanjidic.has_tag_name("kanjidic2")); + + let mut levels = HashMap::new(); + + for x in kanjidic.children() { + if !x.has_tag_name("character") { + continue; + } + let mut literal = None; + let mut jlpt = None; + let mut grade = None; + for y in x.children() { + if y.has_tag_name("literal") { + literal = y.text(); + } + if y.has_tag_name("misc") { + for z in y.children() { + if z.has_tag_name("jlpt") { + jlpt = 
z.text().and_then(|x| str::parse::<i32>(x).ok()); + } + if z.has_tag_name("grade") { + grade = z.text().and_then(|x| str::parse::<i32>(x).ok()); + } + } + } + } + match grade { + Some(i) if i <= 6 => grade = Some(7), + _ => (), + } + if let Some(lit) = literal { + assert_eq!(lit.chars().count(), 1); + let jlpt = match jlpt { + Some(4) => Some(5), + Some(3) => Some(4), + Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3), + x => x, + }; + levels + .entry((jlpt, grade)) + .or_insert(String::new()) + .extend(lit.chars()); + } + } + + let mut levels = levels.into_iter().collect::<Vec<_>>(); + levels.sort_by_key(|((j, g), _)| match (j, g) { + (Some(j), Some(g)) => (10 - *j) * 20 + *g, + (Some(j), None) => (10 - *j) * 20 + 15, + (None, Some(g)) => 1000 + *g, + (None, None) => 1015, + }); + + let mut ret = Vec::new(); + let mut pc = Charset::default(); + for ((j, g), chars) in levels.into_iter() { + let name = match (j, g) { + (Some(j), Some(7)) => format!("N{}a", j), + (Some(j), Some(8)) => format!("N{}b", j), + (Some(j), Some(g)) => format!("N{}-{}", j, g), + (Some(j), None) => format!("N{}+", j), + (None, Some(7)) => format!("N0a"), + (None, Some(8)) => format!("N0b"), + (None, Some(g)) => format!("N0-{}", g), + (None, None) => format!("N0+"), + }; + let chars = Charset::new(chars).diff(&pc); + pc = pc.union(&chars); + ret.push((name, chars)); + } + + Ok(ret) +} + +pub fn read_kanji_levels() -> Result<Vec<(String, String)>> { + Ok(fs::read_to_string("data/kanji_levels.txt")? 
+ .lines() + .filter_map(|l| l.split_once(": ")) + .map(|(l, k)| (l.to_string(), k.to_string())) + .collect::<Vec<_>>()) +} + +pub fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { + let file = fs::File::open("data/examples.utf")?; + + let mut ret = Vec::new(); + let mut a = "".to_string(); + + for (i, line) in io::BufReader::new(file).lines().enumerate() { + let line = line?; + if line.starts_with("A:") { + a = line; + } else if line.starts_with("B:") { + let s = a.strip_prefix("A: "); + let t = line.strip_prefix("B: "); + if let (Some(a), Some(b)) = (s, t) { + if let Some((ja, eng)) = a.split_once("\t") { + if let Some((eng, id)) = eng.split_once("#") { + ret.push(Example { + ja: ja.to_string(), + en: eng.to_string(), + expl: b.to_string(), + id: Some(id.to_string()), + chars: Charset::new(ja).inter(all_kanji), + }); + } else { + ret.push(Example { + ja: ja.to_string(), + en: eng.to_string(), + expl: b.to_string(), + id: None, + chars: Charset::new(ja).inter(all_kanji), + }); + } + } + } + } + if i % 10000 == 0 { + eprintln!("read examples: {}/300 (x1000)", i / 1000); + } + } + + Ok(ret) +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct JlptVocab { + pub level: String, + pub chars: Charset, + pub kanji: String, + pub kana: String, + pub en: String, +} + +impl JlptVocab { + pub fn to_string(&self) -> String { + format!( + "{}\t{}\t{}\t{}\t{}", + self.level, + self.chars.to_string(), + self.kanji, + self.kana, + self.en + ) + } +} + +pub fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> { + let mut vocab = vec![]; + vocab.extend(parse_jlpt_vocab_combined( + "data/n5_vocab.txt", + "N5", + all_kanji, + )?); + vocab.extend(parse_jlpt_vocab_split( + "data/n4_vocab_hiragana.txt", + "data/n4_vocab_eng.txt", + "N4", + all_kanji, + )?); + vocab.extend(parse_jlpt_vocab_split( + "data/n3_vocab_hiragana.txt", + "data/n3_vocab_eng.txt", + "N3", + all_kanji, + )?); + vocab.extend(parse_jlpt_vocab_split( + 
"data/n2_vocab_hiragana.txt", + "data/n2_vocab_eng.txt", + "N2", + all_kanji, + )?); + vocab.extend(parse_jlpt_vocab_split( + "data/n1_vocab_hiragana.txt", + "data/n1_vocab_eng.txt", + "N1", + all_kanji, + )?); + for v in vocab.iter() { + println!("{}", v.to_string()); + } + Ok(()) +} + +fn parse_jlpt_vocab_combined( + file: &str, + level: &str, + all_kanji: &Charset, +) -> Result<Vec<JlptVocab>> { + let lines = jlpt_vocab_read_file(file)?; + let mut ret = vec![]; + for (kanji, answer) in lines { + let (eng, kana) = match answer.split_once('\n') { + Some((a, b)) => (a, b.trim()), + None => (answer.trim(), ""), + }; + for kanji in kanji.split('/') { + ret.push(JlptVocab { + level: level.to_string(), + chars: Charset::new(kanji).inter(all_kanji), + kanji: kanji.to_string(), + kana: kana.to_string(), + en: eng.to_string(), + }); + } + } + Ok(ret) +} + +fn parse_jlpt_vocab_split( + kana_file: &str, + eng_file: &str, + level: &str, + all_kanji: &Charset, +) -> Result<Vec<JlptVocab>> { + let eng_lines = jlpt_vocab_read_file(eng_file)? 
+ .into_iter() + .collect::<HashMap<String, String>>(); + + let lines = jlpt_vocab_read_file(kana_file)?; + let mut ret = vec![]; + for (kanji, kana) in lines { + let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana)); + if let Some(eng) = eng { + for kanji in kanji.split('/') { + ret.push(JlptVocab { + level: level.to_string(), + chars: Charset::new(kanji).inter(all_kanji), + kanji: kanji.to_string(), + kana: kana.to_string(), + en: eng.to_string(), + }); + } + } + } + Ok(ret) +} + +fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> { + let re = regex::Regex::new(r#"<span class="\w+">"#)?; + + let file = fs::File::open(file)?; + let mut ret = vec![]; + for line in io::BufReader::new(file).lines() { + let line = line?.replace("<br>", "\n").replace("</span>", ""); + let line = re.replace_all(&line, ""); + if let Some((a, b)) = line.split_once('|') { + ret.push((a.trim().to_string(), b.trim().to_string())); + } + } + + Ok(ret) +} + +pub fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> { + let file = fs::File::open("data/jlpt_vocab.txt")?; + let mut ret = vec![]; + for line in io::BufReader::new(file).lines() { + let line = line?; + let line = line.splitn(5, "\t").collect::<Vec<_>>(); + if line.len() == 5 { + ret.push(JlptVocab { + level: line[0].to_string(), + chars: Charset::new(line[1]), + kanji: line[2].to_string(), + kana: line[3].to_string(), + en: line[4].to_string(), + }); + } + } + Ok(ret) +} |