diff options
author | Alex Auvolat <alex@adnab.me> | 2023-09-25 16:23:04 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-09-25 16:23:04 +0200 |
commit | 8c6fe47d809eab3daad2e2b560295ecf4fa12796 (patch) | |
tree | 929eef5993b3ac6a064aa61bff301038c31af702 /src/main.rs | |
parent | 12bbe59e89fc5481c21163b7e1cbbb9a72bd470b (diff) | |
download | datagengo-8c6fe47d809eab3daad2e2b560295ecf4fa12796.tar.gz datagengo-8c6fe47d809eab3daad2e2b560295ecf4fa12796.zip |
remove modest quantities of extra vocabulary that also appear in sentences
Diffstat (limited to 'src/main.rs')
-rw-r--r-- | src/main.rs | 12 |
1 file changed, 9 insertions, 3 deletions
diff --git a/src/main.rs b/src/main.rs index 9cc72fb..533f157 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1025,17 +1025,19 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) { }; let mut done = Charset::default(); - for (i, batch) in all_batches.iter_mut().enumerate() { + let mut extra_vocab = vec![]; + for (i, batch) in all_batches.iter().enumerate() { let done_after = done.union(&batch.chars); - batch.extra_vocab = vocab + let batch_extra_vocab = vocab .iter() .filter(|v| v.chars.inter_len(&batch.chars) > 0) .filter(|v| match_level(batch, &v.level)) .filter(|v| v.chars.diff(&done_after).len() == 0) - .filter(|v| !batch.examples.iter().any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji))) + .filter(|v| !all_batches[i..std::cmp::min(all_batches.len(), i+10)].iter().any(|b| b.examples.iter().any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji)))) .cloned() .collect::<Vec<_>>(); + extra_vocab.push(batch_extra_vocab); println!("---- BATCH #{:03} ----", i); for v in batch.extra_vocab.iter() { @@ -1044,6 +1046,10 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) { done = done_after; } + + for (batch, vocab) in all_batches.iter_mut().zip(extra_vocab.into_iter()) { + batch.extra_vocab = vocab; + } } // ===================================================================== |