about | summary | refs | log | tree | commit | diff
path: root/src/server.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/server.rs')
-rw-r--r-- src/server.rs 150
1 files changed, 146 insertions, 4 deletions
diff --git a/src/server.rs b/src/server.rs
index 83b9151..58c4dee 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -1,10 +1,11 @@
use std::fs;
+use std::io::Write;
use std::sync::Arc;
use anyhow::anyhow;
+use rand::prelude::*;
use http_types::mime;
-use tide::prelude::*;
use tide::Request;
use crate::datafiles::*;
@@ -14,6 +15,27 @@ use crate::*;
pub async fn server_main() -> tide::Result<()> {
// ---- load data files ----
+ eprintln!("Loading kanji levels...");
+ let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
+ let all_kanji = Charset::new(
+ kanji_levels
+ .iter()
+ .map(|(_, x)| x.to_string())
+ .collect::<Vec<_>>()
+ .join(""),
+ );
+
+ eprintln!("Loading examples...");
+ let mut examples = read_examples(&all_kanji).expect("read_examples");
+ examples.retain(|e| (5..=25).contains(&e.ja.chars().count()));
+ let examples = Box::leak(examples.into_boxed_slice());
+
+ eprintln!("Counting chars in examples...");
+ let example_freq = calc_example_freq(&examples);
+
+ eprintln!("Loading furigana overrides...");
+ let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");
+
eprintln!("Loading JMdict_e.xml...");
let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
let jmdict_raw: &'static str = String::leak(jmdict_raw);
@@ -36,9 +58,6 @@ pub async fn server_main() -> tide::Result<()> {
let batches = read_batches().expect("read/parse");
let batches = Box::leak(batches.into_boxed_slice());
- eprintln!("Loading kanji levels...");
- let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
-
let mut index_bytes = Vec::new();
format_index_to(&mut index_bytes, &batches, &kanji_levels).unwrap();
let index = String::leak(String::from_utf8(index_bytes).unwrap());
@@ -51,6 +70,9 @@ pub async fn server_main() -> tide::Result<()> {
jmdict_idx,
batches,
index,
+ examples,
+ example_freq,
+ furigana_overrides,
});
let mut app = tide::with_state(state);
@@ -60,6 +82,7 @@ pub async fn server_main() -> tide::Result<()> {
app.at("/index.html").get(home_page);
app.at("/style.css").serve_file("static/style.css")?;
app.at("/about.html").get(about_page);
+ app.at("/ex/:start/:end").get(gen_examples_page);
app.at("/:batch").get(batch_page);
// ---- serve actual http ----
@@ -78,6 +101,9 @@ struct StateStruct {
jmdict_idx: DictIndex<'static>,
batches: &'static [Batch],
index: &'static str,
+ examples: &'static [Example],
+ example_freq: HashMap<char, usize>,
+ furigana_overrides: HashMap<String, String>,
}
async fn home_page(req: Request<State>) -> tide::Result {
@@ -122,3 +148,119 @@ async fn batch_page(req: Request<State>) -> tide::Result {
.content_type(mime::HTML)
.build())
}
+
+/// Handles `GET /ex/:start/:end` and renders a page of example sentences.
+///
+/// `start` and `end` are parsed as indices into the batch list. Characters of
+/// batches `0..=end` form the allowed set and characters of batches
+/// `start..=end` form the needed set handed to `gen_examples`. A range that
+/// does not fit the batch list yields an empty set rather than an error.
+///
+/// Fails if either path parameter is missing or is not a valid `usize`, or if
+/// writing into the response buffer fails.
+async fn gen_examples_page(req: Request<State>) -> tide::Result {
+    let first_level: usize = req.param("start")?.parse()?;
+    let last_level: usize = req.param("end")?.parse()?;
+
+    let state = req.state();
+
+    // Collects every character of the given batches; `None` (range outside
+    // the batch list) contributes no characters at all.
+    let charset_of = |range: Option<&[Batch]>| {
+        Charset::from_iter(
+            range
+                .unwrap_or_default()
+                .iter()
+                .flat_map(|batch| batch.chars.iter()),
+        )
+    };
+    let allowed_chars = charset_of(state.batches.get(..=last_level));
+    let needed_chars = charset_of(state.batches.get(first_level..=last_level));
+
+    let mut examples = gen_examples(state, &allowed_chars, &needed_chars, 50);
+    for example in &mut examples {
+        example.gen_furigana(&state.jmdict_idx, &state.furigana_overrides);
+    }
+
+    // One paragraph per example, in the order they were picked.
+    let mut body: Vec<u8> = Vec::new();
+    examples
+        .iter()
+        .try_for_each(|example| write!(body, "<p>{}</p>", example.furigana_markup()))?;
+
+    Ok(tide::Response::builder(200)
+        .body(body)
+        .content_type(mime::HTML)
+        .build())
+}
+
+// ---- example calculation ----
+
+/// Tallies how often each character is yielded by the `chars` of the given
+/// examples.
+///
+/// NOTE(review): if `chars` is a set (each character at most once per
+/// example), the tally equals the number of examples containing that
+/// character — confirm against `Charset`.
+fn calc_example_freq(examples: &[Example]) -> HashMap<char, usize> {
+    let mut freq = HashMap::new();
+    for c in examples.iter().flat_map(|example| example.chars.iter()) {
+        *freq.entry(c).or_default() += 1;
+    }
+    freq
+}
+
+/// Picks up to `count` example sentences by weighted random sampling.
+///
+/// Only examples whose characters all lie in `allowed_chars` and which share
+/// at least one character with `needed_chars` are considered. While some
+/// needed character has not yet appeared in a picked example, each pick is
+/// restricted to examples containing such a character; once all are covered,
+/// any remaining candidate may be picked.
+///
+/// Each pick is drawn without replacement with weight `1 / f^2`, where `f` is
+/// the smallest `data.example_freq` count among the example's characters.
+///
+/// Returns fewer than `count` examples when the candidates run out. As a side
+/// effect, a tab-separated progress table is printed to stdout.
+fn gen_examples(
+    data: &StateStruct,
+    allowed_chars: &Charset,
+    needed_chars: &Charset,
+    count: usize,
+) -> Vec<Example> {
+    let mut rng = thread_rng();
+    let mut ret = vec![];
+
+    // Pair every usable example with the count of its rarest character.
+    // An example with no character present in `example_freq` is skipped
+    // instead of panicking inside a request handler.
+    let mut candidates = data
+        .examples
+        .iter()
+        .filter(|ex| ex.chars.diff(allowed_chars).is_empty() && ex.chars.intersects(needed_chars))
+        .filter_map(|ex| {
+            ex.chars
+                .iter()
+                .filter_map(|c| data.example_freq.get(&c))
+                .min()
+                .map(|&min_freq| (ex, min_freq))
+        })
+        .collect::<Vec<_>>();
+    let mut remaining_needed = needed_chars.clone();
+
+    let mut have_chars = Charset::new("");
+    println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars");
+    while ret.len() < count {
+        let mut selection = None;
+        let mut total_weight = 0f64;
+        let mut counted = 0;
+
+        for (i, &(ex, min_freq)) in candidates.iter().enumerate() {
+            if !remaining_needed.is_empty() && !ex.chars.intersects(&remaining_needed) {
+                continue;
+            }
+
+            counted += 1;
+            // compensate twice for rare characters
+            // - once to bring all chars to equal probability of sampling
+            // - once to over-sample rare chars because we need to see them more
+            // Convert before squaring so the product cannot overflow `usize`.
+            let freq = min_freq as f64;
+            let weight = 1f64 / (freq * freq);
+            total_weight += weight;
+            // Single-pass weighted choice: the current item replaces the
+            // selection with probability weight / (sum of weights so far).
+            let rand: f64 = rng.gen();
+            if rand < weight / total_weight {
+                selection = Some((i, min_freq));
+            }
+        }
+
+        // Nothing eligible is left: return what we have.
+        let Some((i, f)) = selection else {
+            break;
+        };
+
+        let (ex, _) = candidates.remove(i);
+        remaining_needed = remaining_needed.diff(&ex.chars);
+        have_chars = have_chars.union(&ex.chars);
+        ret.push(ex.clone());
+
+        println!(
+            "{}\t{}\t{}\t{}\t{}\t{}\t{}",
+            ret.len(),
+            f,
+            have_chars.len(),
+            remaining_needed.len(),
+            allowed_chars.len(),
+            counted,
+            ex.chars.to_string()
+        );
+    }
+
+    ret
+}