From d9078f7674c637dd8498ece74ffe6cb7d1e179b9 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 11 Oct 2023 12:15:22 +0200 Subject: add even more example sentences --- src/main.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/main.rs b/src/main.rs index 7ecc195..66f1f51 100644 --- a/src/main.rs +++ b/src/main.rs @@ -152,7 +152,7 @@ fn main() { .and_then(|x| Ok(serde_json::from_slice(&x)?)) .expect("failed to decode batches.json"); - add_examples(&mut batches, &ex); + add_extra_examples(&mut batches, &ex); fs::write( "data/batches.json", @@ -984,7 +984,7 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) { } } -fn add_examples(all_batches: &mut [Batch], examples: &[Example]) { +fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) { let mut chars = Charset::default(); for (i, batch) in all_batches.iter_mut().enumerate() { chars = chars.union(&batch.chars); @@ -1007,17 +1007,20 @@ fn add_examples(all_batches: &mut [Batch], examples: &[Example]) { batch.extra_examples.clear(); let mut in_batch = Charset::from_iter(batch.examples.iter().map(|x| x.chars.chars().iter().copied()).flatten()); let mut in_extra = Charset::default(); - while batch.extra_examples.len() < 20 { + while batch.extra_examples.len() < 40 { let best = candidates.iter().enumerate() - .max_by_key(|(_, ex)| (ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len())); - if let Some((i, ex)) = best { - batch.extra_examples.push(ex.clone()); - in_batch = in_batch.union(&ex.chars); - in_extra = in_extra.union(&ex.chars); - candidates.remove(i); - } else { - break; + .map(|(i, ex)| (i, ex, ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len())) + .max_by_key(|(_, _, w1, w2)| (*w1, *w2)); + if let Some((i, ex, w1, w2)) = best { + if w1 > 0 || w2 > 0 || batch.extra_examples.len() < 20 { + batch.extra_examples.push(ex.clone()); + in_batch = in_batch.union(&ex.chars); + in_extra = in_extra.union(&ex.chars); + candidates.remove(i); + continue; + } } + break; } batch.extra_examples.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes())); -- cgit v1.2.3