about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Alex Auvolat <alex@adnab.me> 2023-09-25 15:10:41 +0200
committer: Alex Auvolat <alex@adnab.me> 2023-09-25 15:10:41 +0200
commit: 12bbe59e89fc5481c21163b7e1cbbb9a72bd470b (patch)
tree: 4b60d8c7534235cae9d2453fb3a82e952020e244 /src
parent: bde10f88a32a1c9deb274b81938155ebcc2e9400 (diff)
download: datagengo-12bbe59e89fc5481c21163b7e1cbbb9a72bd470b.tar.gz
download: datagengo-12bbe59e89fc5481c21163b7e1cbbb9a72bd470b.zip
Add extra vocabulary from JLPT lists
Diffstat (limited to 'src')
-rw-r--r-- src/main.rs 98
1 file changed, 97 insertions, 1 deletion
diff --git a/src/main.rs b/src/main.rs
index 48d8bf9..9cc72fb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -30,6 +30,7 @@ enum Cmd {
},
Simplify,
Cleanup,
+ AddVocab,
Format,
}
@@ -119,6 +120,21 @@ fn main() {
)
.expect("save");
}
+ Cmd::AddVocab => {
+ let mut batches: Vec<Batch> = fs::read("data/batches.json")
+ .map_err(anyhow::Error::from)
+ .and_then(|x| Ok(serde_json::from_slice(&x)?))
+ .unwrap_or_default();
+ let jlpt_vocab = load_jlpt_vocab().expect("load_jlpt_vocab");
+ add_vocab(&mut batches, &jlpt_vocab);
+ fs::write(
+ "data/batches.json",
+ serde_json::to_string_pretty(&batches)
+ .expect("serialize")
+ .as_bytes(),
+ )
+ .expect("save");
+ }
Cmd::Format => {
let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
let jmdict = roxmltree::Document::parse_with_options(
@@ -297,7 +313,7 @@ fn read_examples(all_kanji: &Charset) -> Result<Vec<Example>> {
Ok(ret)
}
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
struct JlptVocab {
level: String,
chars: Charset,
@@ -392,6 +408,25 @@ fn jlpt_vocab_read_file(file: &str) -> Result<Vec<(String, String)>> {
Ok(ret)
}
+/// Reads the JLPT vocabulary list stored in `data/jlpt_vocab.txt`.
+///
+/// Every line is split on tab characters into at most five fields which are
+/// taken, in order, as `level`, `chars` (passed through `Charset::new`),
+/// `kanji`, `kana` and `en`. Because the split stops after five fields, any
+/// further tabs stay inside `en`. Lines that yield fewer than five fields are
+/// skipped without any diagnostic.
+///
+/// # Errors
+///
+/// Fails if the file cannot be opened or if reading a line fails.
+fn load_jlpt_vocab() -> Result<Vec<JlptVocab>> {
+    let reader = io::BufReader::new(fs::File::open("data/jlpt_vocab.txt")?);
+    let mut entries = Vec::new();
+    for row in reader.lines() {
+        let row = row?;
+        let fields: Vec<&str> = row.splitn(5, "\t").collect();
+        // The slice pattern only matches when exactly five fields were found.
+        if let &[level, chars, kanji, kana, en] = fields.as_slice() {
+            entries.push(JlptVocab {
+                level: level.to_string(),
+                chars: Charset::new(chars),
+                kanji: kanji.to_string(),
+                kana: kana.to_string(),
+                en: en.to_string(),
+            });
+        }
+    }
+    Ok(entries)
+}
+
// =====================================================================
// BATCH STRUCTURES AND GENERATION
// =====================================================================
@@ -416,6 +451,8 @@ struct Batch {
chars_p2: Charset,
chars_bad: Charset,
examples: Vec<Example>,
+ #[serde(default)]
+ extra_vocab: Vec<JlptVocab>,
}
fn gen_batches(
@@ -970,6 +1007,45 @@ fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]
}
}
+/// Recomputes `extra_vocab` for every batch from the given vocabulary list.
+///
+/// Batches are visited in order while the union of their `chars` is
+/// accumulated. A vocabulary entry is attached to a batch when all of the
+/// following hold, checked in this order:
+///
+/// 1. the entry shares at least one character with the batch's `chars`;
+/// 2. the batch's `level` string mentions the entry's level or any level
+///    that comes after it in the sequence N4, N3, N2, N1, N0 (`N2a` and
+///    `N2b` count as `N2`);
+/// 3. `entry.chars.diff(..)` against the characters of this batch and all
+///    earlier batches is empty (presumably: the entry uses no character that
+///    has not been introduced yet -- confirm against `Charset::diff`);
+/// 4. the entry's `kanji` does not occur in the `ja` or `expl` text of any
+///    example of the batch.
+///
+/// The previous content of `extra_vocab` is replaced, and the selection for
+/// each batch is printed to standard output.
+///
+/// # Panics
+///
+/// Panics if an entry reaching the level check carries a level other than
+/// `N4`, `N3`, `N2`, `N2a`, `N2b`, `N1` or `N0`.
+fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
+    // Batch levels in the order used by the level check below.
+    const LEVEL_ORDER: [&str; 5] = ["N4", "N3", "N2", "N1", "N0"];
+
+    let level_allowed = |batch: &Batch, level: &str| -> bool {
+        // Index of the first batch level that may receive this vocab level.
+        let first = match level {
+            "N4" => 0,
+            "N3" => 1,
+            "N2" | "N2a" | "N2b" => 2,
+            "N1" => 3,
+            "N0" => 4,
+            _ => panic!("invalid vocab level {}", level),
+        };
+        LEVEL_ORDER[first..]
+            .iter()
+            .any(|&name| batch.level.contains(name))
+    };
+
+    let mut seen = Charset::default();
+    for (idx, batch) in all_batches.iter_mut().enumerate() {
+        // Characters known once this batch has been studied.
+        let seen_with_batch = seen.union(&batch.chars);
+
+        let mut selected = Vec::new();
+        for entry in vocab {
+            if entry.chars.inter_len(&batch.chars) == 0 {
+                continue;
+            }
+            if !level_allowed(&*batch, &entry.level) {
+                continue;
+            }
+            if entry.chars.diff(&seen_with_batch).len() != 0 {
+                continue;
+            }
+            let already_in_examples = batch
+                .examples
+                .iter()
+                .any(|ex| ex.ja.contains(&entry.kanji) || ex.expl.contains(&entry.kanji));
+            if already_in_examples {
+                continue;
+            }
+            selected.push(entry.clone());
+        }
+        batch.extra_vocab = selected;
+
+        println!("---- BATCH #{:03} ----", idx);
+        for entry in batch.extra_vocab.iter() {
+            println!("{}", entry.to_string());
+        }
+
+        seen = seen_with_batch;
+    }
+}
+
// =====================================================================
// FORMATTING TO HTML
// =====================================================================
@@ -1073,6 +1149,26 @@ fn format_batch_aux<'a>(
writeln!(f, r#"</details>"#)?;
}
+ writeln!(f, "<hr />")?;
+ writeln!(f, r#"<details><summary>Extra vocabulary (this level)</summary>"#)?;
+ for v in batch.extra_vocab.iter() {
+ if batch.level.contains(&v.level) {
+ writeln!(f, r#"<p>({}) {} [{}] {}</p>"#, v.level, v.kanji, v.kana, v.en)?;
+ }
+ }
+ writeln!(f, r#"</details>"#)?;
+ if !batch.level.contains("N4") {
+ writeln!(f, r#"<details><summary>Extra vocabulary (previous levels)</summary>"#)?;
+ for v in batch.extra_vocab.iter() {
+ if !batch.level.contains(&v.level) {
+ writeln!(f, r#"<p>({}) {} [{}] {}</p>"#, v.level, v.kanji, v.kana, v.en)?;
+ }
+ }
+ writeln!(f, r#"</details>"#)?;
+ }
+ writeln!(f, "<hr />")?;
+ writeln!(f, "<p>\(≧▽≦)/</p>")?;
+
write!(f, "</body></html>")?;
f.flush()?;
Ok(())