From 727f5657f0a801fc68c0e04e791dcd34cc605fba Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Tue, 10 Oct 2023 10:01:58 +0200 Subject: rebuild example sentence list with more variety --- src/main.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/main.rs b/src/main.rs index 4addd02..b69a2da 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1003,8 +1003,24 @@ fn add_examples(all_batches: &mut [Batch], examples: &[Example]) { .map(|(_, ex)| ex) .collect::<Vec<_>>(); candidates.shuffle(&mut thread_rng()); - candidates.truncate(20); - batch.extra_examples = candidates; + + batch.extra_examples.clear(); + let mut in_batch = Charset::from_iter(batch.examples.iter().map(|x| x.chars.chars().iter().copied()).flatten()); + let mut in_extra = Charset::default(); + while batch.extra_examples.len() < 20 { + let best = candidates.iter().enumerate() + .max_by_key(|(_, ex)| (ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len())); + if let Some((i, ex)) = best { + batch.extra_examples.push(ex.clone()); + in_batch = in_batch.union(&ex.chars); + in_extra = in_extra.union(&ex.chars); + candidates.remove(i); + } else { + break; + } + } + + batch.extra_examples.shuffle(&mut thread_rng()); println!("---- BATCH #{:03} ----", i); for ex in batch.extra_examples.iter() { -- cgit v1.2.3