use std::fs;
use std::io::Write;
use std::sync::Arc;
use anyhow::anyhow;
use rand::prelude::*;
use http_types::mime;
use tide::Request;
use crate::datafiles::*;
use crate::format::*;
use crate::*;
pub async fn server_main() -> tide::Result<()> {
// ---- load data files ----
eprintln!("Loading kanji levels...");
let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
let all_kanji = Charset::new(
kanji_levels
.iter()
.map(|(_, x)| x.to_string())
.collect::<Vec<_>>()
.join(""),
);
eprintln!("Loading examples...");
let mut examples = read_examples(&all_kanji).expect("read_examples");
examples.retain(|e| (5..=25).contains(&e.ja.chars().count()));
let examples = Box::leak(examples.into_boxed_slice());
eprintln!("Counting chars in examples...");
let example_freq = calc_example_freq(&examples);
eprintln!("Loading furigana overrides...");
let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");
eprintln!("Loading JMdict_e.xml...");
let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
let jmdict_raw: &'static str = String::leak(jmdict_raw);
eprintln!("Parsing JMdict_e.xml...");
let jmdict = roxmltree::Document::parse_with_options(
&jmdict_raw,
roxmltree::ParsingOptions {
allow_dtd: true,
..Default::default()
},
)
.expect("parse_jmdict");
let jmdict_xml = Box::leak(Box::new(jmdict));
eprintln!("Indexing JMdict_e.xml...");
let jmdict_idx = index_jmdict(jmdict_xml);
eprintln!("Loading batches.json...");
let batches = read_batches().expect("read/parse");
let batches = Box::leak(batches.into_boxed_slice());
let mut index_bytes = Vec::new();
format_index_to(&mut index_bytes, &batches, &kanji_levels).unwrap();
let index = String::leak(String::from_utf8(index_bytes).unwrap());
// ---- setup http server ----
let state = Arc::new(StateStruct {
jmdict_raw,
jmdict_xml,
jmdict_idx,
batches,
index,
examples,
example_freq,
furigana_overrides,
});
let mut app = tide::with_state(state);
app.with(tide::log::LogMiddleware::new());
app.at("/").get(home_page);
app.at("/index.html").get(home_page);
app.at("/style.css").serve_file("static/style.css")?;
app.at("/about.html").get(about_page);
app.at("/ex/:start/:end").get(gen_examples_page);
app.at("/:batch").get(batch_page);
// ---- serve actual http ----
eprintln!("Server listening on 127.0.0.1:8080");
app.listen("127.0.0.1:8080").await?;
Ok(())
}
/// Shared, reference-counted application state handed to every request handler.
type State = Arc<StateStruct>;

/// Read-only data loaded once by `server_main` and shared by all handlers.
///
/// The `&'static` fields point at data that `server_main` deliberately leaks
/// at start-up.
// NOTE(review): `jmdict_raw` and `jmdict_xml` are not read by any handler
// visible in this file, which is presumably why dead-code warnings are
// silenced here — confirm.
#[allow(dead_code)]
struct StateStruct {
    /// Raw text of `data/JMdict_e.xml`.
    jmdict_raw: &'static str,
    /// Parsed XML document; borrows from `jmdict_raw`.
    jmdict_xml: &'static roxmltree::Document<'static>,
    /// Index produced by `index_jmdict` over `jmdict_xml`.
    jmdict_idx: DictIndex<'static>,
    /// Batches returned by `read_batches`; a batch page is addressed by its
    /// position in this slice.
    batches: &'static [Batch],
    /// Home page HTML, rendered once at start-up by `format_index_to`.
    index: &'static str,
    /// Example sentences whose Japanese text is 5 to 25 characters long.
    examples: &'static [Example],
    /// Per-character counts computed by `calc_example_freq` over `examples`.
    example_freq: HashMap<char, usize>,
    /// Overrides returned by `read_furigana_overrides`, passed to
    /// `Example::gen_furigana`.
    furigana_overrides: HashMap<String, String>,
}
/// Serves the home page.
///
/// The HTML was rendered once at start-up and is stored in the shared state,
/// so this handler only wraps it in a `200` HTML response.
async fn home_page(req: Request<State>) -> tide::Result {
    let html: &'static str = req.state().index;
    let response = tide::Response::builder(200)
        .body(html)
        .content_type(mime::HTML)
        .build();
    Ok(response)
}
/// Serves the "about" page, rendered on each request by `format_about_to`.
///
/// # Errors
/// Propagates any error reported by `format_about_to`.
async fn about_page(_req: Request<State>) -> tide::Result {
    let mut html: Vec<u8> = Vec::new();
    format_about_to(&mut html)?;
    let response = tide::Response::builder(200)
        .body(html)
        .content_type(mime::HTML)
        .build();
    Ok(response)
}
async fn batch_page(req: Request<State>) -> tide::Result {
let batch_idx = req.param("batch")?;
let batch_idx: usize = batch_idx
.strip_suffix(".html")
.unwrap_or(batch_idx)
.parse()?;
let batch = req
.state()
.batches
.get(batch_idx)
.ok_or(anyhow!("this batch number does not exist"))?;
let mut buf = vec![];
format_batch_to(
&mut buf,
&req.state().jmdict_idx,
req.state().batches.len(),
batch_idx,
batch,
)?;
Ok(tide::Response::builder(200)
.body(buf)
.content_type(mime::HTML)
.build())
}
async fn gen_examples_page(req: Request<State>) -> tide::Result {
let first_level: usize = req.param("start")?.parse()?;
let last_level: usize = req.param("end")?.parse()?;
let allowed_chars = Charset::from_iter(
req.state()
.batches
.get(..=last_level)
.unwrap_or_default()
.iter()
.map(|b| b.chars.iter())
.flatten(),
);
let needed_chars = Charset::from_iter(
req.state()
.batches
.get(first_level..=last_level)
.unwrap_or_default()
.iter()
.map(|b| b.chars.iter())
.flatten(),
);
let mut examples = gen_examples(&req.state(), &allowed_chars, &needed_chars, 50);
for ex in examples.iter_mut() {
ex.gen_furigana(&req.state().jmdict_idx, &req.state().furigana_overrides);
}
let mut buf: Vec<u8> = vec![];
for ex in examples.iter() {
write!(&mut buf, "<p>{}</p>", ex.furigana_markup())?;
}
Ok(tide::Response::builder(200)
.body(buf)
.content_type(mime::HTML)
.build())
}
// ---- example calculation ----
/// Counts, for every character, how many times it is yielded by the `chars`
/// of all given examples.
///
/// Returns an empty map when `examples` is empty.
fn calc_example_freq(examples: &[Example]) -> HashMap<char, usize> {
    examples
        .iter()
        .flat_map(|example| example.chars.iter())
        .fold(HashMap::new(), |mut freq, ch| {
            *freq.entry(ch).or_default() += 1;
            freq
        })
}
/// Randomly picks up to `count` example sentences.
///
/// Only sentences whose characters all belong to `allowed_chars` and that
/// share at least one character with `needed_chars` are considered. While some
/// needed characters are still uncovered, only sentences containing one of
/// them are eligible; once all are covered, every remaining candidate is.
///
/// Each round selects one candidate in a single pass: every eligible candidate
/// replaces the current pick with probability `weight / running total`, which
/// makes the final pick proportional to its weight. The weight is
/// `1 / f²`, where `f` is the smallest `example_freq` value among the
/// sentence's characters.
///
/// A tab-separated progress table is printed to stdout. Fewer than `count`
/// sentences are returned when the candidates run out.
fn gen_examples(
    data: &StateStruct,
    allowed_chars: &Charset,
    needed_chars: &Charset,
    count: usize,
) -> Vec<Example> {
    let mut rng = thread_rng();
    let mut picked = Vec::new();

    // Usable sentences paired with the frequency of their rarest character.
    let mut pool = Vec::new();
    for ex in data.examples.iter() {
        let only_allowed = ex.chars.diff(allowed_chars).is_empty();
        if !(only_allowed && ex.chars.intersects(needed_chars)) {
            continue;
        }
        let rarest = *ex
            .chars
            .iter()
            .filter_map(|c| data.example_freq.get(&c))
            .min()
            .unwrap();
        pool.push((ex, rarest));
    }

    let mut still_needed = needed_chars.clone();
    let mut covered = Charset::new("");
    println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars");

    while picked.len() < count {
        let mut winner = None;
        let mut weight_sum = 0f64;
        let mut eligible = 0;
        for (idx, (ex, freq)) in pool.iter().enumerate() {
            let helps = still_needed.len() == 0 || ex.chars.intersects(&still_needed);
            if !helps {
                continue;
            }
            eligible += 1;
            // compensate twice for rare characters
            // - once to bring all chars to equal probability of sampling
            // - once to over-sample rare chars because we need to see them more
            let weight = 1f64 / (*freq * *freq) as f64;
            weight_sum += weight;
            let roll: f64 = rng.gen();
            if roll < weight / weight_sum {
                winner = Some((idx, *freq));
            }
        }

        // Nothing eligible is left: stop early with what we have.
        let Some((idx, freq)) = winner else {
            break;
        };
        let (ex, _) = pool.remove(idx);
        still_needed = still_needed.diff(&ex.chars);
        covered = covered.union(&ex.chars);
        picked.push(ex.clone());
        println!(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}",
            picked.len(),
            freq,
            covered.len(),
            still_needed.len(),
            allowed_chars.len(),
            eligible,
            ex.chars.to_string()
        );
    }
    picked
}