From 12bbe59e89fc5481c21163b7e1cbbb9a72bd470b Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 25 Sep 2023 15:10:41 +0200 Subject: Add extra vocabulary from JLPT lists --- src/main.rs | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/main.rs b/src/main.rs index 48d8bf9..9cc72fb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -30,6 +30,7 @@ enum Cmd { }, Simplify, Cleanup, + AddVocab, Format, } @@ -119,6 +120,21 @@ fn main() { ) .expect("save"); } + Cmd::AddVocab => { + let mut batches: Vec = fs::read("data/batches.json") + .map_err(anyhow::Error::from) + .and_then(|x| Ok(serde_json::from_slice(&x)?)) + .unwrap_or_default(); + let jlpt_vocab = load_jlpt_vocab().expect("load_jlpt_vocab"); + add_vocab(&mut batches, &jlpt_vocab); + fs::write( + "data/batches.json", + serde_json::to_string_pretty(&batches) + .expect("serialize") + .as_bytes(), + ) + .expect("save"); + } Cmd::Format => { let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); let jmdict = roxmltree::Document::parse_with_options( @@ -297,7 +313,7 @@ fn read_examples(all_kanji: &Charset) -> Result> { Ok(ret) } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] struct JlptVocab { level: String, chars: Charset, @@ -392,6 +408,25 @@ fn jlpt_vocab_read_file(file: &str) -> Result> { Ok(ret) } +fn load_jlpt_vocab() -> Result> { + let file = fs::File::open("data/jlpt_vocab.txt")?; + let mut ret = vec![]; + for line in io::BufReader::new(file).lines() { + let line = line?; + let line = line.splitn(5, "\t").collect::>(); + if line.len() == 5 { + ret.push(JlptVocab { + level: line[0].to_string(), + chars: Charset::new(line[1]), + kanji: line[2].to_string(), + kana: line[3].to_string(), + en: line[4].to_string(), + }); + } + } + Ok(ret) +} + // ===================================================================== // BATCH STRUCTURES AND GENERATION // ===================================================================== @@ -416,6 +451,8 @@ struct Batch { chars_p2: Charset, chars_bad: Charset, examples: Vec, + #[serde(default)] + extra_vocab: Vec, } fn gen_batches( @@ -970,6 +1007,45 @@ fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)] } } +fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) { + let match_level = |batch: &Batch, level: &str| { + let n4 = batch.level.contains("N4"); + let n3 = batch.level.contains("N3"); + let n2 = batch.level.contains("N2"); + let n1 = batch.level.contains("N1"); + let n0 = batch.level.contains("N0"); + match level { + "N4" => n4 || n3 || n2 || n1 || n0, + "N3" => n3 || n2 || n1 || n0, + "N2" | "N2a" | "N2b" => n2 || n1 || n0, + "N1" => n1 || n0, + "N0" => n0, + _ => panic!("invalid vocab level {}", level), + } + }; + + let mut done = Charset::default(); + for (i, batch) in all_batches.iter_mut().enumerate() { + let done_after = done.union(&batch.chars); + + batch.extra_vocab = vocab + .iter() + .filter(|v| v.chars.inter_len(&batch.chars) > 0) + .filter(|v| match_level(batch, &v.level)) + .filter(|v| v.chars.diff(&done_after).len() == 0) + .filter(|v| !batch.examples.iter().any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji))) + .cloned() + .collect::>(); + + println!("---- BATCH #{:03} ----", i); + for v in batch.extra_vocab.iter() { + println!("{}", v.to_string()); + } + + done = done_after; + } +} + // ===================================================================== // FORMATTING TO HTML // ===================================================================== @@ -1073,6 +1149,26 @@ fn format_batch_aux<'a>( writeln!(f, r#""#)?; } + writeln!(f, "
")?; + writeln!(f, r#"
Extra vocabulary (this level)"#)?; + for v in batch.extra_vocab.iter() { + if batch.level.contains(&v.level) { + writeln!(f, r#"

({}) {} [{}] {}

"#, v.level, v.kanji, v.kana, v.en)?; + } + } + writeln!(f, r#"
"#)?; + if !batch.level.contains("N4") { + writeln!(f, r#"
Extra vocabulary (previous levels)"#)?; + for v in batch.extra_vocab.iter() { + if !batch.level.contains(&v.level) { + writeln!(f, r#"

({}) {} [{}] {}

"#, v.level, v.kanji, v.kana, v.en)?; + } + } + writeln!(f, r#"
"#)?; + } + writeln!(f, "
")?; + writeln!(f, "

\(≧▽≦)/

")?; + write!(f, "")?; f.flush()?; Ok(()) -- cgit v1.2.3