diff options
Diffstat (limited to 'src/main.rs')
-rw-r--r-- | src/main.rs | 177 |
1 files changed, 0 insertions, 177 deletions
diff --git a/src/main.rs b/src/main.rs index 252740b..1ad5e77 100644 --- a/src/main.rs +++ b/src/main.rs @@ -37,8 +37,6 @@ enum Cmd { Simplify, Cleanup, AddVocab, - AddExamples, - AddFurigana, Format, Server, } @@ -116,53 +114,6 @@ async fn main() { save_batches(batches).expect("save_batches"); } - Cmd::AddExamples => { - let kanji_levels = read_kanji_levels().expect("read_kanji_levels"); - let all_kanji = Charset::new( - kanji_levels - .iter() - .map(|(_, x)| x.to_string()) - .collect::<Vec<_>>() - .join(""), - ); - - let mut ex = read_examples(&all_kanji).expect("read_examples"); - ex.retain(|e| (5..=25).contains(&e.ja.chars().count())); - - let mut batches = read_batches().expect("read_batches"); - - add_extra_examples(&mut batches, &ex); - - save_batches(batches).expect("save_batches"); - } - Cmd::AddFurigana => { - let mut batches = read_batches().expect("read_batches"); - - let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); - let jmdict = roxmltree::Document::parse_with_options( - &jmdict, - roxmltree::ParsingOptions { - allow_dtd: true, - ..Default::default() - }, - ) - .expect("parse_jmdict"); - let jmdict_idx = index_jmdict(&jmdict); - - let overrides = read_furigana_overrides().expect("read_furigana_overrides"); - - for batch in batches.iter_mut() { - for ex in batch - .examples - .iter_mut() - .chain(batch.extra_examples.iter_mut()) - { - ex.gen_furigana(&jmdict_idx, &overrides); - } - } - - save_batches(batches).expect("save_batches"); - } Cmd::Format => { let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); let jmdict = roxmltree::Document::parse_with_options( @@ -229,8 +180,6 @@ pub struct Batch { pub examples: Vec<Example>, #[serde(default)] pub extra_vocab: Vec<JlptVocab>, - #[serde(default)] - pub extra_examples: Vec<Example>, } fn gen_batches( @@ -688,129 +637,3 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) { batch.extra_vocab = vocab; } } - -fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) { - let mut chars = Charset::default(); - let mut char_seen_count: HashMap<char, usize> = HashMap::new(); - - for (i, batch) in all_batches.iter_mut().enumerate() { - println!("---- BATCH #{:03} ----", i); - chars = chars.union(&batch.chars); - - // Count characters in batch in char_seen_count as a lot - for ex in batch.examples.iter() { - for c in ex.chars.iter() { - *char_seen_count.entry(c).or_default() += 5; - } - } - - // Take only examples that: - // - contain kanji of this batch - // - only contain kanji of this or previous batches - // - are not in the batch's main example sentences - let candidates = examples - .iter() - .filter(|x| x.chars.inter_len(&batch.chars) > 0) - .filter(|x| x.chars.diff(&chars).len() == 0) - .filter(|x| batch.examples.iter().all(|y| y.ja != x.ja)); - - // Take only one candidate sentence for each possible set of represented kanji - let mut cand_by_chars = HashMap::new(); - for c in candidates { - cand_by_chars.insert(c.chars.to_string(), c.clone()); - } - let mut candidates = cand_by_chars - .into_iter() - .map(|(_, ex)| ex) - .collect::<Vec<_>>(); - - // Sorte candidates in a deterministic random order - candidates.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes())); - - batch.extra_examples.clear(); - - let mut batch_char_seen_count: HashMap<char, usize> = HashMap::new(); - let mut in_batch = - Charset::from_iter(batch.examples.iter().map(|x| x.chars.iter()).flatten()); - let mut in_batch_extra = Charset::default(); - - while batch.extra_examples.len() < 40 { - let batch_min_seen = batch - .chars - .iter() - .map(|x| batch_char_seen_count.get(&x).copied().unwrap_or(0)) - .min() - .unwrap(); - // Target chars: chars of the batch that have the less examples - let c0 = - Charset::from_iter(batch.chars.iter().filter(|x| { - batch_char_seen_count.get(x).copied().unwrap_or(0) == batch_min_seen - })); - // Target chars: chars that have been seen less than cnt times - let fc = |cnt| { - Charset::from_iter( - chars - .iter() - .filter(|x| char_seen_count.get(x).copied().unwrap_or(0) <= cnt), - ) - }; - let c1 = fc(5); - let c2 = fc(6); - let c3 = fc(7); - let c4 = fc(10); - - let best = candidates - .iter() - .enumerate() - .filter(|(_, ex)| { - batch.extra_examples.len() < 20 || ex.chars.diff(&in_batch_extra).len() > 0 - }) - .map(|(i, ex)| { - let weight = ( - ex.chars.inter_len(&c0), - ex.chars.inter_len(&c1), - ex.chars.inter_len(&c2), - ex.chars.inter_len(&c3), - ex.chars.inter_len(&c4), - ex.chars.diff(&in_batch_extra).len(), - ); - (i, ex, weight) - }) - .max_by_key(|(_, _, w)| *w); - if let Some((i, ex, w)) = best { - println!("{:?}\t{} - {}", w, ex.ja, ex.en); - - batch.extra_examples.push(ex.clone()); - in_batch = in_batch.union(&ex.chars); - in_batch_extra = in_batch_extra.union(&ex.chars); - - for c in ex.chars.iter() { - *char_seen_count.entry(c).or_default() += 1; - if batch.chars.contains(c) { - *batch_char_seen_count.entry(c).or_default() += 1; - } - } - - candidates.remove(i); - } else { - break; - } - } - - batch - .extra_examples - .sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes())); - - for i in 1..20 { - println!( - "Seen {:02}: {}", - i, - char_seen_count.iter().filter(|(_, v)| **v == i).count() - ); - } - println!( - "Seen more: {}", - char_seen_count.iter().filter(|(_, v)| **v >= 20).count() - ); - } -} |