diff options
author | Alex Auvolat <alex@adnab.me> | 2023-09-25 15:10:41 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-09-25 15:10:41 +0200 |
commit | 12bbe59e89fc5481c21163b7e1cbbb9a72bd470b (patch) | |
tree | 4b60d8c7534235cae9d2453fb3a82e952020e244 /src | |
parent | bde10f88a32a1c9deb274b81938155ebcc2e9400 (diff) | |
download | datagengo-12bbe59e89fc5481c21163b7e1cbbb9a72bd470b.tar.gz datagengo-12bbe59e89fc5481c21163b7e1cbbb9a72bd470b.zip |
Add extra vocabulary from JLPT lists
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 98 |
1 files changed, 97 insertions, 1 deletions
diff --git a/src/main.rs b/src/main.rs index 48d8bf9..9cc72fb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -30,6 +30,7 @@ enum Cmd { }, Simplify, Cleanup, + AddVocab, Format, } @@ -119,6 +120,21 @@ fn main() { ) .expect("save"); } + Cmd::AddVocab => { + let mut batches: Vec<Batch> = fs::read("data/batches.json") + .map_err(anyhow::Error::from) + .and_then(|x| Ok(serde_json::from_slice(&x)?)) + .unwrap_or_default(); + let jlpt_vocab = load_jlpt_vocab().expect("load_jlpt_vocab"); + add_vocab(&mut batches, &jlpt_vocab); + fs::write( + "data/batches.json", + serde_json::to_string_pretty(&batches) + .expect("serialize") + .as_bytes(), + ) + .expect("save"); + } Cmd::Format => { let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); let jmdict = roxmltree::Document::parse_with_options( @@ -297,7 +313,7 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> { Ok(ret) } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] struct JlptVocab { level: String, chars: Charset, @@ -392,6 +408,25 @@ fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> { Ok(ret) } +fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> { + let file = fs::File::open("data/jlpt_vocab.txt")?; + let mut ret = vec![]; + for line in io::BufReader::new(file).lines() { + let line = line?; + let line = line.splitn(5, "\t").collect::<Vec<_>>(); + if line.len() == 5 { + ret.push(JlptVocab { + level: line[0].to_string(), + chars: Charset::new(line[1]), + kanji: line[2].to_string(), + kana: line[3].to_string(), + en: line[4].to_string(), + }); + } + } + Ok(ret) +} + // ===================================================================== // BATCH STRUCTURES AND GENERATION // ===================================================================== @@ -416,6 +451,8 @@ struct Batch { chars_p2: Charset, chars_bad: Charset, examples: Vec<Example>, + #[serde(default)] + extra_vocab: Vec<JlptVocab>, } fn gen_batches( @@ -970,6 +1007,45 @@ fn 
cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)] } } +fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) { + let match_level = |batch: &Batch, level: &str| { + let n4 = batch.level.contains("N4"); + let n3 = batch.level.contains("N3"); + let n2 = batch.level.contains("N2"); + let n1 = batch.level.contains("N1"); + let n0 = batch.level.contains("N0"); + match level { + "N4" => n4 || n3 || n2 || n1 || n0, + "N3" => n3 || n2 || n1 || n0, + "N2" | "N2a" | "N2b" => n2 || n1 || n0, + "N1" => n1 || n0, + "N0" => n0, + _ => panic!("invalid vocab level {}", level), + } + }; + + let mut done = Charset::default(); + for (i, batch) in all_batches.iter_mut().enumerate() { + let done_after = done.union(&batch.chars); + + batch.extra_vocab = vocab + .iter() + .filter(|v| v.chars.inter_len(&batch.chars) > 0) + .filter(|v| match_level(batch, &v.level)) + .filter(|v| v.chars.diff(&done_after).len() == 0) + .filter(|v| !batch.examples.iter().any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji))) + .cloned() + .collect::<Vec<_>>(); + + println!("---- BATCH #{:03} ----", i); + for v in batch.extra_vocab.iter() { + println!("{}", v.to_string()); + } + + done = done_after; + } +} + // ===================================================================== // FORMATTING TO HTML // ===================================================================== @@ -1073,6 +1149,26 @@ fn format_batch_aux<'a>( writeln!(f, r#"</details>"#)?; } + writeln!(f, "<hr />")?; + writeln!(f, r#"<details><summary>Extra vocabulary (this level)</summary>"#)?; + for v in batch.extra_vocab.iter() { + if batch.level.contains(&v.level) { + writeln!(f, r#"<p>({}) {} [{}] {}</p>"#, v.level, v.kanji, v.kana, v.en)?; + } + } + writeln!(f, r#"</details>"#)?; + if !batch.level.contains("N4") { + writeln!(f, r#"<details><summary>Extra vocabulary (previous levels)</summary>"#)?; + for v in batch.extra_vocab.iter() { + if !batch.level.contains(&v.level) { + writeln!(f, r#"<p>({}) 
{} [{}] {}</p>"#, v.level, v.kanji, v.kana, v.en)?; + } + } + writeln!(f, r#"</details>"#)?; + } + writeln!(f, "<hr />")?; + writeln!(f, "<p>\\(≧▽≦)/</p>")?; + write!(f, "</body></html>")?; f.flush()?; Ok(()) |