diff options
author | Alex Auvolat <alex@adnab.me> | 2023-10-06 15:32:59 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-10-06 15:32:59 +0200 |
commit | 5aae4a8185d1417028a4b22d43fbac851d51a843 (patch) | |
tree | a70b8fcea5e5aedfb72a9689bb2864ecbf74de0c /src | |
parent | d602f00607aa23cf49485637fc88f0484746a80d (diff) | |
download | datagengo-5aae4a8185d1417028a4b22d43fbac851d51a843.tar.gz datagengo-5aae4a8185d1417028a4b22d43fbac851d51a843.zip |
add extra examples
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 67 |
1 file changed, 67 insertions, 0 deletions
```diff
diff --git a/src/main.rs b/src/main.rs
index ce352d4..8668a82 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -31,6 +31,7 @@ enum Cmd {
     Simplify,
     Cleanup,
     AddVocab,
+    AddExamples,
     Format,
 }
 
@@ -133,6 +134,34 @@ fn main() {
             )
             .expect("save");
         }
+        Cmd::AddExamples => {
+            let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
+            let all_kanji = Charset::new(
+                kanji_levels
+                    .iter()
+                    .map(|(_, x)| x.to_string())
+                    .collect::<Vec<_>>()
+                    .join(""),
+            );
+
+            let mut ex = read_examples(&all_kanji).expect("read_examples");
+            ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
+
+            let mut batches: Vec<Batch> = fs::read("data/batches.json")
+                .map_err(anyhow::Error::from)
+                .and_then(|x| Ok(serde_json::from_slice(&x)?))
+                .unwrap_or_default();
+
+            add_examples(&mut batches, &ex);
+
+            fs::write(
+                "data/batches.json",
+                serde_json::to_string_pretty(&batches)
+                    .expect("serialize")
+                    .as_bytes(),
+            )
+            .expect("save");
+        }
         Cmd::Format => {
             let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
             let jmdict = roxmltree::Document::parse_with_options(
@@ -495,6 +524,8 @@ struct Batch {
     examples: Vec<Example>,
     #[serde(default)]
     extra_vocab: Vec<JlptVocab>,
+    #[serde(default)]
+    extra_examples: Vec<Example>,
 }
 
 fn gen_batches(
@@ -953,6 +984,32 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
     }
 }
 
+fn add_examples(all_batches: &mut [Batch], examples: &[Example]) {
+    let mut chars = Charset::default();
+    for (i, batch) in all_batches.iter_mut().enumerate() {
+        chars = chars.union(&batch.chars);
+
+        let candidates = examples.iter()
+            .filter(|x| x.chars.inter_len(&batch.chars) > 0)
+            .filter(|x| x.chars.diff(&chars).len() == 0)
+            .filter(|x| batch.examples.iter().all(|y| y.ja != x.ja));
+        let mut cand_by_chars = HashMap::new();
+        for c in candidates {
+            cand_by_chars.insert(c.chars.to_string(), c.clone());
+        }
+        let mut candidates = cand_by_chars.into_iter().map(|(_, ex)| ex).collect::<Vec<_>>();
+        candidates.shuffle(&mut thread_rng());
+        candidates.truncate(20);
+        batch.extra_examples = candidates;
+
+
+        println!("---- BATCH #{:03} ----", i);
+        for ex in batch.extra_examples.iter() {
+            println!("{} - {}", ex.ja, ex.en);
+        }
+    }
+}
+
 // =====================================================================
 // FORMATTING TO HTML
 // =====================================================================
@@ -1075,6 +1132,16 @@ fn format_batch_aux<'a>(
         &batch.extra_vocab.iter().filter(|v| !batch.level.contains(&v.level)).collect::<Vec<_>>(),
         "Extra vocabulary (previous levels)")?;
 
+    writeln!(f, r#"<details><summary>Extra examples (reading practice)</summary><table class="extratable">"#)?;
+    for ex in batch.extra_examples.iter() {
+        writeln!(
+            f,
+            r#"<tr><td style="text-align: center"><span style="font-size: 1.2em"> {} </span><br />{}</td></tr>"#,
+            ex.ja, ex.en
+        )?;
+    }
+    writeln!(f, r#"</table></details>"#)?;
+
+
     writeln!(f, "<hr />")?;
     writeln!(f, "<p>\(≧▽≦)/</p>")?;
```