From 8c6fe47d809eab3daad2e2b560295ecf4fa12796 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 25 Sep 2023 16:23:04 +0200 Subject: remove modest quantities of extra vocabulary that also appear in sentences --- src/main.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/main.rs b/src/main.rs index 9cc72fb..533f157 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1025,17 +1025,19 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) { }; let mut done = Charset::default(); - for (i, batch) in all_batches.iter_mut().enumerate() { + let mut extra_vocab = vec![]; + for (i, batch) in all_batches.iter().enumerate() { let done_after = done.union(&batch.chars); - batch.extra_vocab = vocab + let batch_extra_vocab = vocab .iter() .filter(|v| v.chars.inter_len(&batch.chars) > 0) .filter(|v| match_level(batch, &v.level)) .filter(|v| v.chars.diff(&done_after).len() == 0) - .filter(|v| !batch.examples.iter().any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji))) + .filter(|v| !all_batches[i..std::cmp::min(all_batches.len(), i+10)].iter().any(|b| b.examples.iter().any(|ex| ex.ja.contains(&v.kanji) || ex.expl.contains(&v.kanji)))) .cloned() .collect::<Vec<_>>(); + extra_vocab.push(batch_extra_vocab); println!("---- BATCH #{:03} ----", i); for v in batch.extra_vocab.iter() { @@ -1044,6 +1046,10 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) { done = done_after; } + + for (batch, vocab) in all_batches.iter_mut().zip(extra_vocab.into_iter()) { + batch.extra_vocab = vocab; + } } // ===================================================================== -- cgit v1.2.3