aboutsummaryrefslogtreecommitdiff
path: root/src/main.rs
diff options
context:
space:
mode:
author Alex Auvolat <alex@adnab.me> 2023-10-10 10:01:58 +0200
committer Alex Auvolat <alex@adnab.me> 2023-10-10 10:01:58 +0200
commit 727f5657f0a801fc68c0e04e791dcd34cc605fba (patch)
tree 7c2c7700e63102075ac883ea38bd98effda3f316 /src/main.rs
parent b604583570e4de2bd5c19cf101908ad1eb9b03e6 (diff)
download datagengo-727f5657f0a801fc68c0e04e791dcd34cc605fba.tar.gz
datagengo-727f5657f0a801fc68c0e04e791dcd34cc605fba.zip
rebuild example sentence list with more variety
Diffstat (limited to 'src/main.rs')
-rw-r--r-- src/main.rs 20
1 files changed, 18 insertions, 2 deletions
diff --git a/src/main.rs b/src/main.rs
index 4addd02..b69a2da 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1003,8 +1003,24 @@ fn add_examples(all_batches: &mut [Batch], examples: &[Example]) {
.map(|(_, ex)| ex)
.collect::<Vec<_>>();
candidates.shuffle(&mut thread_rng());
- candidates.truncate(20);
- batch.extra_examples = candidates;
+
+ batch.extra_examples.clear();
+ let mut in_batch = Charset::from_iter(batch.examples.iter().map(|x| x.chars.chars().iter().copied()).flatten());
+ let mut in_extra = Charset::default();
+ while batch.extra_examples.len() < 20 {
+ let best = candidates.iter().enumerate()
+ .max_by_key(|(_, ex)| (ex.chars.diff(&in_batch).len(), ex.chars.diff(&in_extra).len()));
+ if let Some((i, ex)) = best {
+ batch.extra_examples.push(ex.clone());
+ in_batch = in_batch.union(&ex.chars);
+ in_extra = in_extra.union(&ex.chars);
+ candidates.remove(i);
+ } else {
+ break;
+ }
+ }
+
+ batch.extra_examples.shuffle(&mut thread_rng());
println!("---- BATCH #{:03} ----", i);
for ex in batch.extra_examples.iter() {