diff options
Diffstat (limited to 'src/server.rs')
-rw-r--r-- | src/server.rs | 183 |
1 files changed, 179 insertions, 4 deletions
diff --git a/src/server.rs b/src/server.rs
index 83b9151..58c4dee 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -1,10 +1,11 @@
 use std::fs;
+use std::io::Write;
 use std::sync::Arc;
 
 use anyhow::anyhow;
+use rand::prelude::*;
 
 use http_types::mime;
-use tide::prelude::*;
 use tide::Request;
 
 use crate::datafiles::*;
@@ -14,6 +15,27 @@ use crate::*;
 
 pub async fn server_main() -> tide::Result<()> {
     // ---- load data files ----
+    eprintln!("Loading kanji levels...");
+    let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
+    let all_kanji = Charset::new(
+        kanji_levels
+            .iter()
+            .map(|(_, x)| x.to_string())
+            .collect::<Vec<_>>()
+            .join(""),
+    );
+
+    eprintln!("Loading examples...");
+    let mut examples = read_examples(&all_kanji).expect("read_examples");
+    examples.retain(|e| (5..=25).contains(&e.ja.chars().count()));
+    let examples = Box::leak(examples.into_boxed_slice());
+
+    eprintln!("Counting chars in examples...");
+    let example_freq = calc_example_freq(&examples);
+
+    eprintln!("Loading furigana overrides...");
+    let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");
+
     eprintln!("Loading JMdict_e.xml...");
     let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
     let jmdict_raw: &'static str = String::leak(jmdict_raw);
@@ -36,9 +58,6 @@ pub async fn server_main() -> tide::Result<()> {
     let batches = read_batches().expect("read/parse");
     let batches = Box::leak(batches.into_boxed_slice());
 
-    eprintln!("Loading kanji levels...");
-    let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
-
     let mut index_bytes = Vec::new();
     format_index_to(&mut index_bytes, &batches, &kanji_levels).unwrap();
     let index = String::leak(String::from_utf8(index_bytes).unwrap());
@@ -51,6 +70,9 @@ pub async fn server_main() -> tide::Result<()> {
         jmdict_idx,
         batches,
         index,
+        examples,
+        example_freq,
+        furigana_overrides,
     });
 
     let mut app = tide::with_state(state);
@@ -60,6 +82,7 @@ pub async fn server_main() -> tide::Result<()> {
     app.at("/index.html").get(home_page);
     app.at("/style.css").serve_file("static/style.css")?;
     app.at("/about.html").get(about_page);
+    app.at("/ex/:start/:end").get(gen_examples_page);
     app.at("/:batch").get(batch_page);
 
     // ---- serve actual http ----
@@ -78,6 +101,9 @@ struct StateStruct {
     jmdict_idx: DictIndex<'static>,
     batches: &'static [Batch],
     index: &'static str,
+    examples: &'static [Example],
+    example_freq: HashMap<char, usize>,
+    furigana_overrides: HashMap<String, String>,
 }
 
 async fn home_page(req: Request<State>) -> tide::Result {
@@ -122,3 +148,152 @@ async fn batch_page(req: Request<State>) -> tide::Result {
         .content_type(mime::HTML)
         .build())
 }
+
+/// Handles `GET /ex/:start/:end`: renders a page of generated example sentences.
+///
+/// `start` and `end` are indices into `batches` parsed from the URL. Sentences may use
+/// any character from batches `0..=end` and are chosen so that they exercise characters
+/// from batches `start..=end`. A range that does not fit inside `batches` (including
+/// `start > end`) selects no characters, which yields an empty page rather than an error.
+///
+/// Returns an HTML body with one `<p>` element per example, or an error if either
+/// path parameter is missing or is not a valid `usize`.
+async fn gen_examples_page(req: Request<State>) -> tide::Result {
+    let first_level: usize = req.param("start")?.parse()?;
+    let last_level: usize = req.param("end")?.parse()?;
+    let state = req.state();
+
+    let allowed_chars = Charset::from_iter(
+        state
+            .batches
+            .get(..=last_level)
+            .unwrap_or_default()
+            .iter()
+            .flat_map(|b| b.chars.iter()),
+    );
+    let needed_chars = Charset::from_iter(
+        state
+            .batches
+            .get(first_level..=last_level)
+            .unwrap_or_default()
+            .iter()
+            .flat_map(|b| b.chars.iter()),
+    );
+
+    let mut examples = gen_examples(state, &allowed_chars, &needed_chars, 50);
+
+    let mut buf: Vec<u8> = vec![];
+    for ex in examples.iter_mut() {
+        ex.gen_furigana(&state.jmdict_idx, &state.furigana_overrides);
+        write!(buf, "<p>{}</p>", ex.furigana_markup())?;
+    }
+
+    Ok(tide::Response::builder(200)
+        .body(buf)
+        .content_type(mime::HTML)
+        .build())
+}
+
+// ---- example calculation ----
+
+/// Counts, for every character, how many times it is yielded by `chars.iter()`
+/// across all of `examples`.
+fn calc_example_freq(examples: &[Example]) -> HashMap<char, usize> {
+    let mut ret = HashMap::new();
+    for ex in examples.iter() {
+        for c in ex.chars.iter() {
+            *ret.entry(c).or_default() += 1;
+        }
+    }
+    ret
+}
+
+/// Randomly picks up to `count` distinct examples from `data.examples`.
+///
+/// Only examples whose characters all belong to `allowed_chars` and that contain at
+/// least one character of `needed_chars` are eligible. While some needed characters
+/// have not appeared in a picked example yet, each pick is restricted to examples
+/// that contain one of them; after that, every remaining eligible example can be
+/// picked. Examples whose rarest character is uncommon are favoured.
+///
+/// Returns fewer than `count` examples when the eligible ones run out. One
+/// tab-separated progress line per pick is printed to stdout.
+fn gen_examples(
+    data: &StateStruct,
+    allowed_chars: &Charset,
+    needed_chars: &Charset,
+    count: usize,
+) -> Vec<Example> {
+    let mut rng = thread_rng();
+    let mut ret = vec![];
+
+    // Pair every eligible example with the frequency of its rarest character. An
+    // example with no counted character is skipped instead of panicking the handler.
+    let mut candidates = data
+        .examples
+        .iter()
+        .filter(|x| x.chars.diff(allowed_chars).is_empty() && x.chars.intersects(needed_chars))
+        .filter_map(|ex| {
+            ex.chars
+                .iter()
+                .filter_map(|c| data.example_freq.get(&c).copied())
+                .min()
+                .map(|min_freq| (ex, min_freq))
+        })
+        .collect::<Vec<_>>();
+    let mut remaining_needed = needed_chars.clone();
+
+    let mut have_chars = Charset::new("");
+    println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars");
+    while ret.len() < count {
+        let mut selection = None;
+        let mut total_weight = 0f64;
+
+        // Single-pass weighted pick: the i-th counted candidate replaces the current
+        // selection with probability weight_i / (sum of weights so far), so each one
+        // ends up chosen with probability proportional to its weight.
+        let mut counted = 0;
+        for (i, (x, f)) in candidates.iter().enumerate() {
+            if !remaining_needed.is_empty() && !x.chars.intersects(&remaining_needed) {
+                continue;
+            }
+
+            counted += 1;
+            // compensate twice for rare characters
+            // - once to bring all chars to equal probability of sampling
+            // - once to over-sample rare chars because we need to see them more
+            // (squared as f64 so that a large count cannot overflow `usize`)
+            let freq = *f as f64;
+            let weight = 1f64 / (freq * freq);
+            total_weight += weight;
+            let rand: f64 = rng.gen();
+            if rand < weight / total_weight {
+                selection = Some((i, *f));
+            }
+        }
+
+        // Nothing was eligible, so no further example can be picked.
+        let Some((i, f)) = selection else {
+            break;
+        };
+
+        // The pick does not depend on candidate order, so the O(1) `swap_remove` is safe.
+        let (ex, _) = candidates.swap_remove(i);
+        remaining_needed = remaining_needed.diff(&ex.chars);
+        have_chars = have_chars.union(&ex.chars);
+        ret.push(ex.clone());
+
+        println!(
+            "{}\t{}\t{}\t{}\t{}\t{}\t{}",
+            ret.len(),
+            f,
+            have_chars.len(),
+            remaining_needed.len(),
+            allowed_chars.len(),
+            counted,
+            ex.chars.to_string()
+        );
+    }
+
+    ret
+}