From bde10f88a32a1c9deb274b81938155ebcc2e9400 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 25 Sep 2023 14:43:31 +0200 Subject: add vocab per jlpt --- src/main.rs | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) (limited to 'src') diff --git a/src/main.rs b/src/main.rs index 335add8..48d8bf9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,6 +21,7 @@ struct Opt { #[derive(Debug, StructOpt)] enum Cmd { ParseKanjidic, + ParseJlptVocab, New { #[structopt(default_value = "10")] count: usize, @@ -42,6 +43,14 @@ fn main() { println!("{}: {}", level, chars.to_string()); } } + Cmd::ParseJlptVocab => { + let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); + let all_kanji = Charset::from_iter(kanji_levels + .iter() + .map(|(_, c)| c.chars()) + .flatten()); + parse_jlpt_vocab(&all_kanji).expect("error"); + } Cmd::New { truncate, count } => { let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); let all_kanji = Charset::new( @@ -288,6 +297,101 @@ fn read_examples(all_kanji: &Charset) -> Result> { Ok(ret) } +#[derive(Clone, Debug)] +struct JlptVocab { + level: String, + chars: Charset, + kanji: String, + kana: String, + en: String, +} + +impl JlptVocab { + fn to_string(&self) -> String { + format!("{}\t{}\t{}\t{}\t{}", + self.level, + self.chars.to_string(), + self.kanji, + self.kana, + self.en) + } +} + +fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> { + let mut vocab = vec![]; + vocab.extend(parse_jlpt_vocab_combined("data/n5_vocab.txt", "N4", all_kanji)?); + vocab.extend(parse_jlpt_vocab_split("data/n4_vocab_hiragana.txt", "data/n4_vocab_eng.txt", "N3", all_kanji)?); + vocab.extend(parse_jlpt_vocab_split("data/n3_vocab_hiragana.txt", "data/n3_vocab_eng.txt", "N2a", all_kanji)?); + vocab.extend(parse_jlpt_vocab_split("data/n2_vocab_hiragana.txt", "data/n2_vocab_eng.txt", "N2b", all_kanji)?); + vocab.extend(parse_jlpt_vocab_split("data/n1_vocab_hiragana.txt", 
"data/n1_vocab_eng.txt", "N1", all_kanji)?); + for v in vocab.iter() { + println!("{}", v.to_string()); + } + Ok(()) +} + +fn parse_jlpt_vocab_combined(file: &str, level: &str, all_kanji: &Charset) -> Result<Vec<JlptVocab>> { + let lines = jlpt_vocab_read_file(file)?; + let mut ret = vec![]; + for (kanji, answer) in lines { + let (eng, kana) = match answer.split_once('\n') { + Some((a, b)) => (a, b.trim()), + None => (answer.trim(), ""), + }; + for kanji in kanji.split('/') { + ret.push(JlptVocab { + level: level.to_string(), + chars: Charset::new(kanji).inter(all_kanji), + kanji: kanji.to_string(), + kana: kana.to_string(), + en: eng.to_string() + }); + } + } + Ok(ret) +} + +fn parse_jlpt_vocab_split(kana_file: &str, eng_file: &str, level: &str, all_kanji: &Charset) -> Result<Vec<JlptVocab>> { + let eng_lines = jlpt_vocab_read_file(eng_file)? + .into_iter() + .collect::<HashMap<_, _>>(); + + let lines = jlpt_vocab_read_file(kana_file)?; + let mut ret = vec![]; + for (kanji, kana) in lines { + let eng = eng_lines.get(&kanji).or(eng_lines.get(&kana)); + if let Some(eng) = eng { + for kanji in kanji.split('/') { + ret.push(JlptVocab { + level: level.to_string(), + chars: Charset::new(kanji).inter(all_kanji), + kanji: kanji.to_string(), + kana: kana.to_string(), + en: eng.to_string() + }); + } + } + } + Ok(ret) +} + +fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> { + let re = regex::Regex::new(r#"<.*?>"#)?; + + let file = fs::File::open(file)?; + let mut ret = vec![]; + for line in io::BufReader::new(file).lines() { + let line = line?.replace("<br/>&#13;", "\n") .replace("<br>&#13;", ""); + let line = re.replace_all(&line, ""); + if let Some((a, b)) = line.split_once('|') { + ret.push((a.trim().to_string(), b.trim().to_string())); + } + } + + Ok(ret) +} + // ===================================================================== // BATCH STRUCTURES AND GENERATION // ===================================================================== -- cgit v1.2.3