author     Alex Auvolat <alex@adnab.me>    2023-09-26 15:22:20 +0200
committer  Alex Auvolat <alex@adnab.me>    2023-09-26 15:22:20 +0200
commit     fe5d9cfea41565dc13adeb169fbd444ab72b023f (patch)
tree       6364d23099a92ae665f84eeb3e5317c98c99bfc9 /src
parent     6ef097e3317bd9a2f7ea4426e63427fba5e7d215 (diff)
Switch to new JLPT levels and regenerate batches 014-
Diffstat (limited to 'src')
-rw-r--r--  src/main.rs | 36
1 file changed, 23 insertions(+), 13 deletions(-)
diff --git a/src/main.rs b/src/main.rs
index c882741..597ebdf 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -191,6 +191,8 @@ fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
}
fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
+ let n3_kanji = Charset::new(&fs::read_to_string("data/n3_kanji.txt")?.trim());
+
let file = fs::read_to_string("data/kanjidic2.xml")?;
let xml = roxmltree::Document::parse(&file)?;
let kanjidic = xml.root().first_child().unwrap();
@@ -225,6 +227,13 @@ fn parse_kanjidic() -> Result<Vec<(String, Charset)>> {
_ => (),
}
if let Some(lit) = literal {
+ assert_eq!(lit.chars().count(), 1);
+ let jlpt = match jlpt {
+ Some(4) => Some(5),
+ Some(3) => Some(4),
+ Some(2) if n3_kanji.contains(lit.chars().next().unwrap()) => Some(3),
+ x => x,
+ };
levels
.entry((jlpt, grade))
.or_insert(String::new())
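
Editor's note on the hunk above: the added lines remap the old 4-level JLPT tags found in kanjidic2 onto the current 5-level scale (old 4 becomes N5, old 3 becomes N4, and old 2 is split into N3 or N2 depending on a separate N3 kanji list read from data/n3_kanji.txt). A minimal standalone sketch of that remapping, using std's HashSet<char> in place of the repository's Charset type and a made-up kanji list:

    use std::collections::HashSet;

    /// Illustrative only: map an old-scale JLPT tag to the new N5..N1 scale.
    fn remap_jlpt(old: Option<u8>, kanji: char, n3_kanji: &HashSet<char>) -> Option<u8> {
        match old {
            Some(4) => Some(5),                              // old 4 -> N5
            Some(3) => Some(4),                              // old 3 -> N4
            Some(2) if n3_kanji.contains(&kanji) => Some(3), // old 2 -> N3 if listed
            x => x,                                          // otherwise unchanged
        }
    }

    fn main() {
        // Toy N3 list, not the real contents of data/n3_kanji.txt.
        let n3_kanji: HashSet<char> = "政議".chars().collect();
        assert_eq!(remap_jlpt(Some(4), '一', &n3_kanji), Some(5));
        assert_eq!(remap_jlpt(Some(2), '政', &n3_kanji), Some(3));
        assert_eq!(remap_jlpt(Some(2), '腕', &n3_kanji), Some(2));
    }
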
@@ -338,25 +347,25 @@ fn parse_jlpt_vocab(all_kanji: &Charset) -> Result<()> {
let mut vocab = vec![];
vocab.extend(parse_jlpt_vocab_combined(
"data/n5_vocab.txt",
- "N4",
+ "N5",
all_kanji,
)?);
vocab.extend(parse_jlpt_vocab_split(
"data/n4_vocab_hiragana.txt",
"data/n4_vocab_eng.txt",
- "N3",
+ "N4",
all_kanji,
)?);
vocab.extend(parse_jlpt_vocab_split(
"data/n3_vocab_hiragana.txt",
"data/n3_vocab_eng.txt",
- "N2a",
+ "N3",
all_kanji,
)?);
vocab.extend(parse_jlpt_vocab_split(
"data/n2_vocab_hiragana.txt",
"data/n2_vocab_eng.txt",
- "N2b",
+ "N2",
all_kanji,
)?);
vocab.extend(parse_jlpt_vocab_split(
@@ -685,11 +694,7 @@ fn gen_level(
}
// Find combination that does that with a good number of examples (tgt_len)
- let factor = match level {
- "N1b" => 1.08,
- _ => 1.0,
- };
- let tgt_len = (avg_len * factor * (batch_count as f32 + 1.)).ceil() as i64
+ let tgt_len = (avg_len * (batch_count as f32 + 1.)).ceil() as i64
        - (sum_len + batch.examples.len()) as i64;
let dyn_mat_cnt = |i| {
let mut cnt = 0;
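
Editor's note: the hunk above drops the level-specific fudge factor from the target-length computation. With hypothetical numbers (not taken from the real data), the simplified formula behaves like this:

    // tgt_len = ceil(avg_len * (batch_count + 1)) - (sum_len + current_batch_len)
    fn main() {
        let avg_len = 20.0_f32;          // assumed average batch length
        let batch_count = 3;             // assumed number of batches built so far
        let sum_len = 55usize;           // assumed examples used so far
        let current_batch_len = 18usize; // examples already in the batch being built
        let tgt_len = (avg_len * (batch_count as f32 + 1.)).ceil() as i64
            - (sum_len + current_batch_len) as i64;
        assert_eq!(tgt_len, 7);          // ceil(80.0) = 80, minus 73
    }
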
@@ -874,12 +879,15 @@ fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]
.flatten(),
);
- batch.level = kanji_levels
+ let mut levels = kanji_levels
.iter()
.filter(|(_, chars)| chars.inter_len(&batch.chars) > 0)
.map(|(lvl, _)| lvl.to_string())
- .collect::<Vec<_>>()
- .join("/");
+ .collect::<Vec<_>>();
+ while levels.len() > 2 {
+ levels.remove(1);
+ }
+ batch.level = levels.join("/");
done = done.union(&batch.chars);
batch.chars_bad = all_chars.diff(&done);
batch.chars_p1 = all_chars.inter(&chars_p1);
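
Editor's note: the loop added in this hunk caps the displayed level string at two labels. A small illustrative sketch of the effect, using a plain Vec<String> rather than the surrounding batch code:

    // Keep only the first and last matched level, e.g. ["N3","N2","N1"] -> "N3/N1".
    fn trim_levels(mut levels: Vec<String>) -> String {
        while levels.len() > 2 {
            levels.remove(1);
        }
        levels.join("/")
    }

    fn main() {
        let levels = vec!["N3".into(), "N2".into(), "N1".into()];
        assert_eq!(trim_levels(levels), "N3/N1");
    }
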
@@ -892,15 +900,17 @@ fn cleanup_batches(all_batches: &mut [Batch], kanji_levels: &[(String, Charset)]
fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
let match_level = |batch: &Batch, level: &str| {
+ let n5 = batch.level.contains("N5");
let n4 = batch.level.contains("N4");
let n3 = batch.level.contains("N3");
let n2 = batch.level.contains("N2");
let n1 = batch.level.contains("N1");
let n0 = batch.level.contains("N0");
match level {
+ "N5" => n5 || n4 || n3 || n2 || n1 || n0,
"N4" => n4 || n3 || n2 || n1 || n0,
"N3" => n3 || n2 || n1 || n0,
- "N2" | "N2a" | "N2b" => n2 || n1 || n0,
+ "N2" => n2 || n1 || n0,
"N1" => n1 || n0,
"N0" => n0,
_ => panic!("invalid vocab level {}", level),
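
Editor's note: the end of the diff updates vocab-to-batch matching for the new scale: vocabulary tagged with a given JLPT level is attached to any batch whose level string contains that level or a harder one, with N0 sitting past N1 at the end of the cascade. A compact sketch of the same cascade, written against a plain &str batch level rather than the Batch struct:

    // Illustrative re-statement of the match_level cascade after this commit.
    fn match_level(batch_level: &str, vocab_level: &str) -> bool {
        const ORDER: [&str; 6] = ["N5", "N4", "N3", "N2", "N1", "N0"]; // easiest to hardest
        let idx = ORDER
            .iter()
            .position(|l| *l == vocab_level)
            .unwrap_or_else(|| panic!("invalid vocab level {}", vocab_level));
        // True if the batch carries this level or any harder one.
        ORDER[idx..].iter().any(|l| batch_level.contains(l))
    }

    fn main() {
        assert!(match_level("N4/N2", "N5"));  // N5 vocab fits an N4/N2 batch
        assert!(!match_level("N5", "N3"));    // N3 vocab does not fit an N5-only batch
    }
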