aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-09-25 16:23:04 +0200
committer Alex Auvolat <alex@adnab.me> 2023-09-25 16:23:04 +0200
commit 8c6fe47d809eab3daad2e2b560295ecf4fa12796 (patch)
tree 929eef5993b3ac6a064aa61bff301038c31af702 /src
parent 12bbe59e89fc5481c21163b7e1cbbb9a72bd470b (diff)
download datagengo-8c6fe47d809eab3daad2e2b560295ecf4fa12796.tar.gz
datagengo-8c6fe47d809eab3daad2e2b560295ecf4fa12796.zip
remove modest quantities of extra vocabulary that also appear in sentences
Diffstat (limited to 'src')
-rw-r--r-- src/main.rs 12
1 file changed, 9 insertions, 3 deletions
diff --git a/src/main.rs b/src/main.rs
index 9cc72fb..533f157 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1025,17 +1025,19 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
};
let mut done = Charset::default();
- for (i, batch) in all_batches.iter_mut().enumerate() {
+ let mut extra_vocab = vec![];
+ for (i, batch) in all_batches.iter().enumerate() {
let done_after = done.union(&batch.chars);
- batch.extra_vocab = vocab
+ let batch_extra_vocab = vocab
.iter()
.filter(|v| v.chars.inter_len(&batch.chars) > 0)
.filter(|v| match_level(batch, &v.level))
.filter(|v| v.chars.diff(&done_after).len() == 0)
- .filter(|v| !batch.examples.iter().any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji)))
+ .filter(|v| !all_batches[i..std::cmp::min(all_batches.len(), i+10)].iter().any(|b| b.examples.iter().any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji))))
.cloned()
.collect::<Vec<_>>();
+ extra_vocab.push(batch_extra_vocab);
println!("---- BATCH #{:03} ----", i);
for v in batch.extra_vocab.iter() {
@@ -1044,6 +1046,10 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
done = done_after;
}
+
+ for (batch, vocab) in all_batches.iter_mut().zip(extra_vocab.into_iter()) {
+ batch.extra_vocab = vocab;
+ }
}
// =====================================================================