diff options
author | Alex Auvolat <alex@adnab.me> | 2023-10-10 10:01:58 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-10-10 10:01:58 +0200 |
commit | 727f5657f0a801fc68c0e04e791dcd34cc605fba (patch) | |
tree | 7c2c7700e63102075ac883ea38bd98effda3f316 /src | |
parent | b604583570e4de2bd5c19cf101908ad1eb9b03e6 (diff) | |
download | datagengo-727f5657f0a801fc68c0e04e791dcd34cc605fba.tar.gz datagengo-727f5657f0a801fc68c0e04e791dcd34cc605fba.zip |
rebuild example sentence list with more variety
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 20 |
1 file changed, 18 insertions, 2 deletions
diff --git a/src/main.rs b/src/main.rs index 4addd02..b69a2da 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1003,8 +1003,24 @@ fn add_examples(all_batches: &mut [Batch], examples: &[Example]) { .map(|(_, ex)| ex) .collect::<Vec<_>>(); candidates.shuffle(&mut thread_rng()); - candidates.truncate(20); - batch.extra_examples = candidates; + + batch.extra_examples.clear(); + let mut in_batch = Charset::from_iter(batch.examples.iter().map(|x| x.chars.chars().iter().copied()).flatten()); + let mut in_extra = Charset::default(); + while batch.extra_examples.len() < 20 { + let best = candidates.iter().enumerate() + .max_by_key(|(_, ex)| (ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len())); + if let Some((i, ex)) = best { + batch.extra_examples.push(ex.clone()); + in_batch = in_batch.union(&ex.chars); + in_extra = in_extra.union(&ex.chars); + candidates.remove(i); + } else { + break; + } + } + + batch.extra_examples.shuffle(&mut thread_rng()); println!("---- BATCH #{:03} ----", i); for ex in batch.extra_examples.iter() { |