aboutsummaryrefslogtreecommitdiff
path: root/src/main.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/main.rs')
-rw-r--r--src/main.rs177
1 files changed, 0 insertions, 177 deletions
diff --git a/src/main.rs b/src/main.rs
index 252740b..1ad5e77 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -37,8 +37,6 @@ enum Cmd {
Simplify,
Cleanup,
AddVocab,
- AddExamples,
- AddFurigana,
Format,
Server,
}
@@ -116,53 +114,6 @@ async fn main() {
save_batches(batches).expect("save_batches");
}
- Cmd::AddExamples => {
- let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
- let all_kanji = Charset::new(
- kanji_levels
- .iter()
- .map(|(_, x)| x.to_string())
- .collect::<Vec<_>>()
- .join(""),
- );
-
- let mut ex = read_examples(&all_kanji).expect("read_examples");
- ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
-
- let mut batches = read_batches().expect("read_batches");
-
- add_extra_examples(&mut batches, &ex);
-
- save_batches(batches).expect("save_batches");
- }
- Cmd::AddFurigana => {
- let mut batches = read_batches().expect("read_batches");
-
- let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
- let jmdict = roxmltree::Document::parse_with_options(
- &jmdict,
- roxmltree::ParsingOptions {
- allow_dtd: true,
- ..Default::default()
- },
- )
- .expect("parse_jmdict");
- let jmdict_idx = index_jmdict(&jmdict);
-
- let overrides = read_furigana_overrides().expect("read_furigana_overrides");
-
- for batch in batches.iter_mut() {
- for ex in batch
- .examples
- .iter_mut()
- .chain(batch.extra_examples.iter_mut())
- {
- ex.gen_furigana(&jmdict_idx, &overrides);
- }
- }
-
- save_batches(batches).expect("save_batches");
- }
Cmd::Format => {
let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
let jmdict = roxmltree::Document::parse_with_options(
@@ -229,8 +180,6 @@ pub struct Batch {
pub examples: Vec<Example>,
#[serde(default)]
pub extra_vocab: Vec<JlptVocab>,
- #[serde(default)]
- pub extra_examples: Vec<Example>,
}
fn gen_batches(
@@ -688,129 +637,3 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
batch.extra_vocab = vocab;
}
}
-
-fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
- let mut chars = Charset::default();
- let mut char_seen_count: HashMap<char, usize> = HashMap::new();
-
- for (i, batch) in all_batches.iter_mut().enumerate() {
- println!("---- BATCH #{:03} ----", i);
- chars = chars.union(&batch.chars);
-
- // Count characters in batch in char_seen_count as a lot
- for ex in batch.examples.iter() {
- for c in ex.chars.iter() {
- *char_seen_count.entry(c).or_default() += 5;
- }
- }
-
- // Take only examples that:
- // - contain kanji of this batch
- // - only contain kanji of this or previous batches
- // - are not in the batch's main example sentences
- let candidates = examples
- .iter()
- .filter(|x| x.chars.inter_len(&batch.chars) > 0)
- .filter(|x| x.chars.diff(&chars).len() == 0)
- .filter(|x| batch.examples.iter().all(|y| y.ja != x.ja));
-
- // Take only one candidate sentence for each possible set of represented kanji
- let mut cand_by_chars = HashMap::new();
- for c in candidates {
- cand_by_chars.insert(c.chars.to_string(), c.clone());
- }
- let mut candidates = cand_by_chars
- .into_iter()
- .map(|(_, ex)| ex)
- .collect::<Vec<_>>();
-
- // Sorte candidates in a deterministic random order
- candidates.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
-
- batch.extra_examples.clear();
-
- let mut batch_char_seen_count: HashMap<char, usize> = HashMap::new();
- let mut in_batch =
- Charset::from_iter(batch.examples.iter().map(|x| x.chars.iter()).flatten());
- let mut in_batch_extra = Charset::default();
-
- while batch.extra_examples.len() < 40 {
- let batch_min_seen = batch
- .chars
- .iter()
- .map(|x| batch_char_seen_count.get(&x).copied().unwrap_or(0))
- .min()
- .unwrap();
- // Target chars: chars of the batch that have the less examples
- let c0 =
- Charset::from_iter(batch.chars.iter().filter(|x| {
- batch_char_seen_count.get(x).copied().unwrap_or(0) == batch_min_seen
- }));
- // Target chars: chars that have been seen less than cnt times
- let fc = |cnt| {
- Charset::from_iter(
- chars
- .iter()
- .filter(|x| char_seen_count.get(x).copied().unwrap_or(0) <= cnt),
- )
- };
- let c1 = fc(5);
- let c2 = fc(6);
- let c3 = fc(7);
- let c4 = fc(10);
-
- let best = candidates
- .iter()
- .enumerate()
- .filter(|(_, ex)| {
- batch.extra_examples.len() < 20 || ex.chars.diff(&in_batch_extra).len() > 0
- })
- .map(|(i, ex)| {
- let weight = (
- ex.chars.inter_len(&c0),
- ex.chars.inter_len(&c1),
- ex.chars.inter_len(&c2),
- ex.chars.inter_len(&c3),
- ex.chars.inter_len(&c4),
- ex.chars.diff(&in_batch_extra).len(),
- );
- (i, ex, weight)
- })
- .max_by_key(|(_, _, w)| *w);
- if let Some((i, ex, w)) = best {
- println!("{:?}\t{} - {}", w, ex.ja, ex.en);
-
- batch.extra_examples.push(ex.clone());
- in_batch = in_batch.union(&ex.chars);
- in_batch_extra = in_batch_extra.union(&ex.chars);
-
- for c in ex.chars.iter() {
- *char_seen_count.entry(c).or_default() += 1;
- if batch.chars.contains(c) {
- *batch_char_seen_count.entry(c).or_default() += 1;
- }
- }
-
- candidates.remove(i);
- } else {
- break;
- }
- }
-
- batch
- .extra_examples
- .sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
-
- for i in 1..20 {
- println!(
- "Seen {:02}: {}",
- i,
- char_seen_count.iter().filter(|(_, v)| **v == i).count()
- );
- }
- println!(
- "Seen more: {}",
- char_seen_count.iter().filter(|(_, v)| **v >= 20).count()
- );
- }
-}