From 4b20865335f14d086e5733fd2bd62acf22434e04 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 15 Nov 2023 19:17:27 +0100 Subject: again try add more diversity to extra examples --- src/main.rs | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'src') diff --git a/src/main.rs b/src/main.rs index f78bf81..9209fd7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -992,6 +992,13 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) { println!("---- BATCH #{:03} ----", i); chars = chars.union(&batch.chars); + // Count characters in batch in char_seen_count as a lot + for ex in batch.examples.iter() { + for c in ex.chars.iter() { + *char_seen_count.entry(c).or_default() += 5; + } + } + // Take only examples that: // - contain kanji of this batch // - only contain kanji of this or previous batches @@ -1042,53 +1049,54 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) { .filter(|x| char_seen_count.get(x).copied().unwrap_or(0) <= cnt), ) }; - let c1 = fc(1); - let c2 = fc(2); - let c4 = fc(4); - let c7 = fc(7); + let c1 = fc(5); + let c2 = fc(6); + let c3 = fc(7); + let c4 = fc(10); let best = candidates .iter() .enumerate() + .filter(|(_, ex)| { + batch.extra_examples.len() < 20 || ex.chars.diff(&in_batch_extra).len() > 0 + }) .map(|(i, ex)| { let weight = ( ex.chars.inter_len(&c0), ex.chars.inter_len(&c1), ex.chars.inter_len(&c2), + ex.chars.inter_len(&c3), ex.chars.inter_len(&c4), - ex.chars.inter_len(&c7), ex.chars.diff(&in_batch_extra).len(), ); (i, ex, weight) }) .max_by_key(|(_, _, w)| *w); if let Some((i, ex, w)) = best { - if ex.chars.diff(&in_batch_extra).len() > 0 || batch.extra_examples.len() < 20 { - println!("{:?}\t{} - {}", w, ex.ja, ex.en); + println!("{:?}\t{} - {}", w, ex.ja, ex.en); - batch.extra_examples.push(ex.clone()); - in_batch = in_batch.union(&ex.chars); - in_batch_extra = in_batch_extra.union(&ex.chars); + batch.extra_examples.push(ex.clone()); + in_batch = in_batch.union(&ex.chars); + in_batch_extra = in_batch_extra.union(&ex.chars); - for c in ex.chars.iter() { - *char_seen_count.entry(c).or_default() += 1; - if batch.chars.contains(c) { - *batch_char_seen_count.entry(c).or_default() += 1; - } + for c in ex.chars.iter() { + *char_seen_count.entry(c).or_default() += 1; + if batch.chars.contains(c) { + *batch_char_seen_count.entry(c).or_default() += 1; } - - candidates.remove(i); - continue; } + + candidates.remove(i); + } else { + break; } - break; } batch .extra_examples .sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes())); - for i in 1..10 { + for i in 1..20 { println!( "Seen {:02}: {}", i, @@ -1097,7 +1105,7 @@ fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) { } println!( "Seen more: {}", - char_seen_count.iter().filter(|(_, v)| **v >= 10).count() + char_seen_count.iter().filter(|(_, v)| **v >= 20).count() ); } } -- cgit v1.2.3