diff options
author | Alex Auvolat <alex@adnab.me> | 2023-10-18 21:59:16 +0200 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2023-10-18 21:59:16 +0200 |
commit | 2a919c1f028d891744d37ad45664986b6cba4a5d (patch) | |
tree | cd231964696421f56b843bde109a193107c0382b /src | |
parent | fab4731ad5a4ca26beb1a342ba85eec92014c04b (diff) | |
download | datagengo-2a919c1f028d891744d37ad45664986b6cba4a5d.tar.gz datagengo-2a919c1f028d891744d37ad45664986b6cba4a5d.zip |
re-allow more diversity in examples
Diffstat (limited to 'src')
-rw-r--r-- | src/main.rs | 4 |
1 file changed, 3 insertions, 1 deletion
```diff
diff --git a/src/main.rs b/src/main.rs
index 04a9e80..598f147 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1025,6 +1025,7 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
                 .map(|x| x.chars.chars().iter().copied())
                 .flatten(),
         );
+        let mut in_batch_extra = Charset::default();
         while batch.extra_examples.len() < 40 {
             let best = candidates
                 .iter()
@@ -1051,10 +1052,11 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
                 })
                 .max_by_key(|(_, _, w1, w2, w3)| (-(*w1 as i64), *w2, -(*w3 * 100_000f32) as i64));
             if let Some((i, ex, w1, w2, w3)) = best {
-                if w2 > 0 || batch.extra_examples.len() < 20 {
+                if ex.chars.diff(&in_batch_extra).len() > 0 || batch.extra_examples.len() < 20 {
                     println!("{}\t{}\t{:.2}\t{} - {}", w1, w2, w3, ex.ja, ex.en);
                     batch.extra_examples.push(ex.clone());
                     in_batch = in_batch.union(&ex.chars);
+                    in_batch_extra = in_batch_extra.union(&ex.chars);
                     for c in ex.chars.chars().iter() {
                         *char_seen_count.entry(*c).or_default() += 1;
                         if batch.chars.chars().contains(c) {
```