From d80afc4b77a2d34272e29914280859c836d9efd0 Mon Sep 17 00:00:00 2001
From: Alex Auvolat
Date: Sun, 10 Mar 2024 20:20:34 +0100
Subject: reservoir sampling one by one (slow, but works)

---
 src/datafiles.rs |   3 +-
 src/format.rs    |  12 +---
 src/main.rs      | 177 ------------------------------------------------------
 src/server.rs    | 150 ++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 150 insertions(+), 192 deletions(-)

diff --git a/src/datafiles.rs b/src/datafiles.rs
index 3065fbf..7a6a5d5 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -22,7 +22,8 @@ pub struct Example {
 // PARSING DATA FILES
 // =====================================================================
 
-pub type DictIndex<'a> = HashMap<&'a str, Vec<roxmltree::Node<'a, 'a>>>;
+pub type DictEntry<'a> = roxmltree::Node<'a, 'a>;
+pub type DictIndex<'a> = HashMap<&'a str, Vec<DictEntry<'a>>>;
 
 pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
     let dict = dict
         .root()
diff --git a/src/format.rs b/src/format.rs
index 3519b13..801611d 100644
--- a/src/format.rs
+++ b/src/format.rs
@@ -154,17 +154,9 @@ pub fn format_batch_to<'a>(
     writeln!(
         buf,
-        r#"
-Extra examples (reading practice)
-"#
+        r#"
+Extra examples (reading practice)
+"#
     )?;
-    for ex in batch.extra_examples.iter() {
-        writeln!(
-            buf,
-            r#"{}
-{}"#,
-            ex.furigana_markup(),
-            ex.en
-        )?;
-    }
-    writeln!(buf, r#""#)?;
+    // TODO
     writeln!(buf, "")?;
     writeln!(buf, "\(≧▽≦)/")?;
diff --git a/src/main.rs b/src/main.rs
index 252740b..1ad5e77 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -37,8 +37,6 @@ enum Cmd {
     Simplify,
     Cleanup,
     AddVocab,
-    AddExamples,
-    AddFurigana,
     Format,
     Server,
 }
@@ -116,53 +114,6 @@ async fn main() {
             save_batches(batches).expect("save_batches");
         }
-        Cmd::AddExamples => {
-            let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
-            let all_kanji = Charset::new(
-                kanji_levels
-                    .iter()
-                    .map(|(_, x)| x.to_string())
-                    .collect::<Vec<_>>()
-                    .join(""),
-            );
-
-            let mut ex = read_examples(&all_kanji).expect("read_examples");
-            ex.retain(|e| (5..=25).contains(&e.ja.chars().count()));
-
-            let mut batches = read_batches().expect("read_batches");
-
-            add_extra_examples(&mut batches, &ex);
-
-            save_batches(batches).expect("save_batches");
-        }
-        Cmd::AddFurigana => {
-            let mut batches = read_batches().expect("read_batches");
-
-            let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
-            let jmdict = roxmltree::Document::parse_with_options(
-                &jmdict,
-                roxmltree::ParsingOptions {
-                    allow_dtd: true,
-                    ..Default::default()
-                },
-            )
-            .expect("parse_jmdict");
-            let jmdict_idx = index_jmdict(&jmdict);
-
-            let overrides = read_furigana_overrides().expect("read_furigana_overrides");
-
-            for batch in batches.iter_mut() {
-                for ex in batch
-                    .examples
-                    .iter_mut()
-                    .chain(batch.extra_examples.iter_mut())
-                {
-                    ex.gen_furigana(&jmdict_idx, &overrides);
-                }
-            }
-
-            save_batches(batches).expect("save_batches");
-        }
         Cmd::Format => {
             let jmdict = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
             let jmdict = roxmltree::Document::parse_with_options(
                 &jmdict,
                 roxmltree::ParsingOptions {
                     allow_dtd: true,
                     ..Default::default()
                 },
             )
             .expect("parse_jmdict");
@@ -229,8 +180,6 @@ pub struct Batch {
     pub examples: Vec<Example>,
     #[serde(default)]
     pub extra_vocab: Vec<JlptVocab>,
-    #[serde(default)]
-    pub extra_examples: Vec<Example>,
 }
 
 fn gen_batches(
@@ -688,129 +637,3 @@ fn add_vocab(all_batches: &mut [Batch], vocab: &[JlptVocab]) {
         batch.extra_vocab = vocab;
     }
 }
-
-fn add_extra_examples(all_batches: &mut [Batch], examples: &[Example]) {
-    let mut chars = Charset::default();
-    let mut char_seen_count: HashMap<char, usize> = HashMap::new();
-
-    for (i, batch) in all_batches.iter_mut().enumerate() {
-        println!("---- BATCH #{:03} ----", i);
-        chars = chars.union(&batch.chars);
-
-        // Count characters in batch in char_seen_count as a lot
-        for ex in batch.examples.iter() {
-            for c in ex.chars.iter() {
-                *char_seen_count.entry(c).or_default() += 5;
-            }
-        }
-
-        // Take only examples that:
-        // - contain kanji of this batch
-        // - only contain kanji of this or previous batches
-        // - are not in the batch's main example sentences
-        let candidates = examples
-            .iter()
-            .filter(|x| x.chars.inter_len(&batch.chars) > 0)
-            .filter(|x| x.chars.diff(&chars).len() == 0)
-            .filter(|x| batch.examples.iter().all(|y| y.ja != x.ja));
-
-        // Take only one candidate sentence for each possible set of represented kanji
-        let mut cand_by_chars = HashMap::new();
-        for c in candidates {
-            cand_by_chars.insert(c.chars.to_string(), c.clone());
-        }
-        let mut candidates = cand_by_chars
-            .into_iter()
-            .map(|(_, ex)| ex)
-            .collect::<Vec<_>>();
-
-        // Sorte candidates in a deterministic random order
-        candidates.sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
-
-        batch.extra_examples.clear();
-
-        let mut batch_char_seen_count: HashMap<char, usize> = HashMap::new();
-        let mut in_batch =
-            Charset::from_iter(batch.examples.iter().map(|x| x.chars.iter()).flatten());
-        let mut in_batch_extra = Charset::default();
-
-        while batch.extra_examples.len() < 40 {
-            let batch_min_seen = batch
-                .chars
-                .iter()
-                .map(|x| batch_char_seen_count.get(&x).copied().unwrap_or(0))
-                .min()
-                .unwrap();
-            // Target chars: chars of the batch that have the less examples
-            let c0 =
-                Charset::from_iter(batch.chars.iter().filter(|x| {
-                    batch_char_seen_count.get(x).copied().unwrap_or(0) == batch_min_seen
-                }));
-            // Target chars: chars that have been seen less than cnt times
-            let fc = |cnt| {
-                Charset::from_iter(
-                    chars
-                        .iter()
-                        .filter(|x| char_seen_count.get(x).copied().unwrap_or(0) <= cnt),
-                )
-            };
-            let c1 = fc(5);
-            let c2 = fc(6);
-            let c3 = fc(7);
-            let c4 = fc(10);
-
-            let best = candidates
-                .iter()
-                .enumerate()
-                .filter(|(_, ex)| {
-                    batch.extra_examples.len() < 20 || ex.chars.diff(&in_batch_extra).len() > 0
-                })
-                .map(|(i, ex)| {
-                    let weight = (
-                        ex.chars.inter_len(&c0),
-                        ex.chars.inter_len(&c1),
-                        ex.chars.inter_len(&c2),
-                        ex.chars.inter_len(&c3),
-                        ex.chars.inter_len(&c4),
-                        ex.chars.diff(&in_batch_extra).len(),
-                    );
-                    (i, ex, weight)
-                })
-                .max_by_key(|(_, _, w)| *w);
-            if let Some((i, ex, w)) = best {
-                println!("{:?}\t{} - {}", w, ex.ja, ex.en);
-
-                batch.extra_examples.push(ex.clone());
-                in_batch = in_batch.union(&ex.chars);
-                in_batch_extra = in_batch_extra.union(&ex.chars);
-
-                for c in ex.chars.iter() {
-                    *char_seen_count.entry(c).or_default() += 1;
-                    if batch.chars.contains(c) {
-                        *batch_char_seen_count.entry(c).or_default() += 1;
-                    }
-                }
-
-                candidates.remove(i);
-            } else {
-                break;
-            }
-        }
-
-        batch
-            .extra_examples
-            .sort_by_key(|ex| fasthash::metro::hash64(ex.ja.as_bytes()));
-
-        for i in 1..20 {
-            println!(
-                "Seen {:02}: {}",
-                i,
-                char_seen_count.iter().filter(|(_, v)| **v == i).count()
-            );
-        }
-        println!(
-            "Seen more: {}",
-            char_seen_count.iter().filter(|(_, v)| **v >= 20).count()
-        );
-    }
-}
diff --git a/src/server.rs b/src/server.rs
index 83b9151..58c4dee 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -1,10 +1,11 @@
 use std::fs;
+use std::io::Write;
 use std::sync::Arc;
 
 use anyhow::anyhow;
+use rand::prelude::*;
 
 use http_types::mime;
-use tide::prelude::*;
 use tide::Request;
 
 use crate::datafiles::*;
@@ -14,6 +15,27 @@ use crate::*;
 pub async fn server_main() -> tide::Result<()> {
     // ---- load data files ----
 
+    eprintln!("Loading kanji levels...");
+    let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
+    let all_kanji = Charset::new(
+        kanji_levels
+            .iter()
+            .map(|(_, x)| x.to_string())
+            .collect::<Vec<_>>()
+            .join(""),
+    );
+
+    eprintln!("Loading examples...");
+    let mut examples = read_examples(&all_kanji).expect("read_examples");
+    examples.retain(|e| (5..=25).contains(&e.ja.chars().count()));
+    let examples = Box::leak(examples.into_boxed_slice());
+
+    eprintln!("Counting chars in examples...");
+    let example_freq = calc_example_freq(&examples);
+
+    eprintln!("Loading furigana overrides...");
+    let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");
+
     eprintln!("Loading JMdict_e.xml...");
     let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
     let jmdict_raw: &'static str = String::leak(jmdict_raw);
@@ -36,9 +58,6 @@ pub async fn server_main() -> tide::Result<()> {
     let batches = read_batches().expect("read/parse");
     let batches = Box::leak(batches.into_boxed_slice());
 
-    eprintln!("Loading kanji levels...");
-    let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
-
     let mut index_bytes = Vec::new();
     format_index_to(&mut index_bytes, &batches, &kanji_levels).unwrap();
     let index = String::leak(String::from_utf8(index_bytes).unwrap());
@@ -51,6 +70,9 @@ pub async fn server_main() -> tide::Result<()> {
         jmdict_idx,
         batches,
         index,
+        examples,
+        example_freq,
+        furigana_overrides,
     });
 
     let mut app = tide::with_state(state);
@@ -60,6 +82,7 @@ pub async fn server_main() -> tide::Result<()> {
     app.at("/index.html").get(home_page);
     app.at("/style.css").serve_file("static/style.css")?;
     app.at("/about.html").get(about_page);
+    app.at("/ex/:start/:end").get(gen_examples_page);
     app.at("/:batch").get(batch_page);
 
     // ---- serve actual http ----
@@ -78,6 +101,9 @@ struct StateStruct {
     jmdict_idx: DictIndex<'static>,
     batches: &'static [Batch],
     index: &'static str,
+    examples: &'static [Example],
+    example_freq: HashMap<char, usize>,
+    furigana_overrides: HashMap,
 }
 
 async fn home_page(req: Request) -> tide::Result {
@@ -122,3 +148,119 @@ async fn batch_page(req: Request) -> tide::Result {
         .content_type(mime::HTML)
         .build())
 }
+
+async fn gen_examples_page(req: Request) -> tide::Result {
+    let first_level: usize = req.param("start")?.parse()?;
+    let last_level: usize = req.param("end")?.parse()?;
+
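+    // Characters from batches up to `end` may appear anywhere in a
+    // sentence (allowed_chars); each sentence must also contain at
+    // least one character from batches `start..=end` (needed_chars).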
+    let allowed_chars = Charset::from_iter(
+        req.state()
+            .batches
+            .get(..=last_level)
+            .unwrap_or_default()
+            .iter()
+            .map(|b| b.chars.iter())
+            .flatten(),
+    );
+    let needed_chars = Charset::from_iter(
+        req.state()
+            .batches
+            .get(first_level..=last_level)
+            .unwrap_or_default()
+            .iter()
+            .map(|b| b.chars.iter())
+            .flatten(),
+    );
+
+    let mut examples = gen_examples(&req.state(), &allowed_chars, &needed_chars, 50);
+    for ex in examples.iter_mut() {
+        ex.gen_furigana(&req.state().jmdict_idx, &req.state().furigana_overrides);
+    }
+
+    let mut buf: Vec<u8> = vec![];
+    for ex in examples.iter() {
+        write!(&mut buf, "{}", ex.furigana_markup())?;
+    }
+
+    Ok(tide::Response::builder(200)
+        .body(buf)
+        .content_type(mime::HTML)
+        .build())
+}
+
+// ---- example calculation ----
+
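+// Count, for each character, how many example sentences contain it;
+// gen_examples() uses these counts to over-sample rare characters.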
+fn calc_example_freq(examples: &[Example]) -> HashMap<char, usize> {
+    let mut ret = HashMap::new();
+    for ex in examples.iter() {
+        for c in ex.chars.iter() {
+            *ret.entry(c).or_default() += 1;
+        }
+    }
+    ret
+}
+
+fn gen_examples(
+    data: &StateStruct,
+    allowed_chars: &Charset,
+    needed_chars: &Charset,
+    count: usize,
+) -> Vec<Example> {
+    let mut rng = thread_rng();
+    let mut ret = vec![];
+
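+    // Each candidate is paired with the corpus frequency of its rarest
+    // character; the sampling weight below is 1 / f^2, so examples
+    // containing rare characters are much more likely to be drawn.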
+    let mut candidates = data
+        .examples
+        .iter()
+        .filter(|x| x.chars.diff(&allowed_chars).is_empty() && x.chars.intersects(&needed_chars))
+        .map(|ex| {
+            (
+                ex,
+                *ex.chars
+                    .iter()
+                    .filter_map(|x| data.example_freq.get(&x))
+                    .min()
+                    .unwrap(),
+            )
+        })
+        .collect::<Vec<_>>();
+    let mut remaining_needed = needed_chars.clone();
+
+    let mut have_chars = Charset::new("");
+    println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars");
+    while ret.len() < count {
+        let mut selection = None;
+        let mut total_weight = 0f64;
+
+        let mut counted = 0;
+        for (i, (x, f)) in candidates.iter().enumerate() {
+            if remaining_needed.len() > 0 && !x.chars.intersects(&remaining_needed) {
+                continue;
+            }
+
+            counted += 1;
+            // compensate twice for rare characters
+            // - once to bring all chars to equal probability of sampling
+            // - once to over-sample rare chars because we need to see them more
+            let weight = 1f64 / (*f * *f) as f64;
+            total_weight += weight;
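+            // Size-1 weighted reservoir step: replacing the current
+            // selection with probability weight / total_weight leaves,
+            // at the end of the scan, each candidate selected with
+            // probability proportional to its weight.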
", ex.furigana_markup())?; + } + + Ok(tide::Response::builder(200) + .body(buf) + .content_type(mime::HTML) + .build()) +} + +// ---- example calculation ---- + +fn calc_example_freq(examples: &[Example]) -> HashMap { + let mut ret = HashMap::new(); + for ex in examples.iter() { + for c in ex.chars.iter() { + *ret.entry(c).or_default() += 1; + } + } + ret +} + +fn gen_examples( + data: &StateStruct, + allowed_chars: &Charset, + needed_chars: &Charset, + count: usize, +) -> Vec { + let mut rng = thread_rng(); + let mut ret = vec![]; + + let mut candidates = data + .examples + .iter() + .filter(|x| x.chars.diff(&allowed_chars).is_empty() && x.chars.intersects(&needed_chars)) + .map(|ex| (ex, *ex.chars.iter().filter_map(|x| data.example_freq.get(&x)).min().unwrap())) + .collect::>(); + let mut remaining_needed = needed_chars.clone(); + + let mut have_chars = Charset::new(""); + println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars"); + while ret.len() < count { + let mut selection = None; + let mut total_weight = 0f64; + + let mut counted = 0; + for (i, (x, f)) in candidates.iter().enumerate() { + if remaining_needed.len() > 0 && !x.chars.intersects(&remaining_needed) { + continue; + } + + counted += 1; + // compensate twice for rare characters + // - once to bring all chars to equal probability of sampling + // - once to over-sample rare chars because we need to see them more + let weight = 1f64 / (*f * *f) as f64; + total_weight += weight; + let rand: f64 = rng.gen(); + if rand < weight / total_weight { + selection = Some((i, *f)) + } + } + + if let Some((i, f)) = selection { + let (ex, _) = candidates.remove(i); + remaining_needed = remaining_needed.diff(&ex.chars); + have_chars = have_chars.union(&ex.chars); + ret.push(ex.clone()); + + println!( + "{}\t{}\t{}\t{}\t{}\t{}\t{}", + ret.len(), + f, + have_chars.len(), + remaining_needed.len(), + allowed_chars.len(), + counted, + ex.chars.to_string() + ); + } else { + break; + } + } + + ret +} -- cgit v1.2.3