diff options
author | Alex Auvolat <alex@adnab.me> | 2023-09-26 15:22:20 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-09-26 15:22:20 +0200 |
commit | fe5d9cfea41565dc13adeb169fbd444ab72b023f (patch) | |
tree | 6364d23099a92ae665f84eeb3e5317c98c99bfc9 /src | |
parent | 6ef097e3317bd9a2f7ea4426e63427fba5e7d215 (diff) | |
download | datagengo-fe5d9cfea41565dc13adeb169fbd444ab72b023f.tar.gz datagengo-fe5d9cfea41565dc13adeb169fbd444ab72b023f.zip |
Switch to new JLPT levels and regenerate batches 014-
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 36 |
1 file changed, 23 insertions, 13 deletions
```diff
diff --git a/src/main.rs b/src/main.rs
index c882741..597ebdf 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -191,6 +191,8 @@ fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
 }
 
 fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
+    let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());
+
     let file = fs::read_to_string("data/kanjidic2.xml")?;
     let xml = roxmltree::Document::parse(&file)?;
     let kanjidic = xml.root().first_child().unwrap();
@@ -225,6 +227,13 @@ fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
             _ => (),
         }
         if let Some(lit) = literal {
+            assert_eq!(lit.chars().count(), 1);
+            let jlpt = match jlpt {
+                Some(4) => Some(5),
+                Some(3) => Some(4),
+                Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3),
+                x => x,
+            };
             levels
                 .entry((jlpt, grade))
                 .or_insert(String::new())
@@ -338,25 +347,25 @@ fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
     let mut vocab = vec![];
     vocab.extend(parse_jlpt_vocab_combined(
         "data/n5_vocab.txt",
-        "N4",
+        "N5",
         all_kanji,
     )?);
     vocab.extend(parse_jlpt_vocab_split(
         "data/n4_vocab_hiragana.txt",
         "data/n4_vocab_eng.txt",
-        "N3",
+        "N4",
         all_kanji,
     )?);
     vocab.extend(parse_jlpt_vocab_split(
         "data/n3_vocab_hiragana.txt",
         "data/n3_vocab_eng.txt",
-        "N2a",
+        "N3",
         all_kanji,
     )?);
     vocab.extend(parse_jlpt_vocab_split(
         "data/n2_vocab_hiragana.txt",
         "data/n2_vocab_eng.txt",
-        "N2b",
+        "N2",
         all_kanji,
     )?);
     vocab.extend(parse_jlpt_vocab_split(
@@ -685,11 +694,7 @@ fn gen_level(
         }
 
         // Find combination that does that with a good number of examples (tgt_len)
-        let factor = match level {
-            "N1b" => 1.08,
-            _ => 1.0,
-        };
-        let tgt_len = (avg_len * factor * (batch_count as f32 + 1.)).ceil() as i64
+        let tgt_len = (avg_len * (batch_count as f32 + 1.)).ceil() as i64
             - (sum_len + batch.examples.len()) as i64;
         let dyn_mat_cnt = |i| {
             let mut cnt = 0;
@@ -874,12 +879,15 @@ fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]
                 .flatten(),
         );
 
-        batch.level = kanji_levels
+        let mut levels = kanji_levels
             .iter()
             .filter(|(_, chars)| chars.inter_len(&batch.chars) > 0)
             .map(|(lvl, _)| lvl.to_string())
-            .collect::<Vec<_>>()
-            .join("/");
+            .collect::<Vec<_>>();
+        while levels.len() > 2 {
+            levels.remove(1);
+        }
+        batch.level = levels.join("/");
         done = done.union(&batch.chars);
         batch.chars_bad = all_chars.diff(&done);
         batch.chars_p1 = all_chars.inter(&chars_p1);
@@ -892,15 +900,17 @@ fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]
 
 fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
     let match_level = |batch: &Batch, level: &str| {
+        let n5 = batch.level.contains("N5");
         let n4 = batch.level.contains("N4");
         let n3 = batch.level.contains("N3");
         let n2 = batch.level.contains("N2");
         let n1 = batch.level.contains("N1");
         let n0 = batch.level.contains("N0");
         match level {
+            "N5" => n5 || n4 || n3 || n2 || n1 || n0,
             "N4" => n4 || n3 || n2 || n1 || n0,
             "N3" => n3 || n2 || n1 || n0,
-            "N2" | "N2a" | "N2b" => n2 || n1 || n0,
+            "N2" => n2 || n1 || n0,
             "N1" => n1 || n0,
             "N0" => n0,
             _ => panic!("invalid vocab level {}", level),
```