aboutsummaryrefslogblamecommitdiff
path: root/src/server.rs
blob: 58c4dee97ad18806ae07a2a142dbc0d88da0c505 (plain) (tree)
1
2
3
4
5
6
7
8
            
                   


                   
                     

                     








                                                




















                                                                                         





















                                                                                   











                                                                        


                           








                                                         
                                                     

















                                                      


                                                











































                                                             



















































































































                                                                                                  
use std::fs;
use std::io::Write;
use std::sync::Arc;

use anyhow::anyhow;
use rand::prelude::*;

use http_types::mime;
use tide::Request;

use crate::datafiles::*;
use crate::format::*;
use crate::*;

/// Loads every data file, builds the shared server state and serves HTTP on `127.0.0.1:8080`.
///
/// All loaded data is leaked on purpose so that the request handlers can hold plain
/// `'static` references to it for as long as the process runs.
///
/// # Panics
///
/// Panics (through `expect`/`unwrap`) if any data file cannot be read or parsed, if
/// formatting the index page fails, or if the formatted index is not valid UTF-8.
///
/// # Errors
///
/// Returns an error if registering the `static/style.css` route or running the listener fails.
pub async fn server_main() -> tide::Result<()> {
    // ---- load data files ----

    eprintln!("Loading kanji levels...");
    let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
    // Join the second element of every level entry into one charset covering all levels
    let all_kanji = Charset::new(
        kanji_levels
            .iter()
            .map(|(_, x)| x.to_string())
            .collect::<Vec<_>>()
            .join(""),
    );

    eprintln!("Loading examples...");
    let mut examples = read_examples(&all_kanji).expect("read_examples");
    // Keep only examples whose `ja` text (presumably the Japanese sentence) has 5 to 25 chars
    examples.retain(|e| (5..=25).contains(&e.ja.chars().count()));
    // Leak the slice: the state stores it as a `&'static [Example]` and it is never freed
    let examples = Box::leak(examples.into_boxed_slice());

    eprintln!("Counting chars in examples...");
    let example_freq = calc_example_freq(&examples);

    eprintln!("Loading furigana overrides...");
    let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");

    eprintln!("Loading JMdict_e.xml...");
    let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
    // The parsed document below borrows from this text, so it must be `'static` as well
    let jmdict_raw: &'static str = String::leak(jmdict_raw);

    eprintln!("Parsing JMdict_e.xml...");
    // NOTE(review): `allow_dtd` is presumably needed because JMdict_e.xml carries a DTD, which
    // roxmltree refuses unless this option is set — confirm against the roxmltree docs
    let jmdict = roxmltree::Document::parse_with_options(
        &jmdict_raw,
        roxmltree::ParsingOptions {
            allow_dtd: true,
            ..Default::default()
        },
    )
    .expect("parse_jmdict");
    // Leaked so that the index built from it can be stored as `DictIndex<'static>`
    let jmdict_xml = Box::leak(Box::new(jmdict));

    eprintln!("Indexing JMdict_e.xml...");
    let jmdict_idx = index_jmdict(jmdict_xml);

    eprintln!("Loading batches.json...");
    let batches = read_batches().expect("read/parse");
    let batches = Box::leak(batches.into_boxed_slice());

    // Render the index page once at startup; `home_page` serves this string unchanged
    let mut index_bytes = Vec::new();
    format_index_to(&mut index_bytes, &batches, &kanji_levels).unwrap();
    let index = String::leak(String::from_utf8(index_bytes).unwrap());

    // ---- setup http server ----

    let state = Arc::new(StateStruct {
        jmdict_raw,
        jmdict_xml,
        jmdict_idx,
        batches,
        index,
        examples,
        example_freq,
        furigana_overrides,
    });

    let mut app = tide::with_state(state);
    app.with(tide::log::LogMiddleware::new());

    app.at("/").get(home_page);
    app.at("/index.html").get(home_page);
    app.at("/style.css").serve_file("static/style.css")?;
    app.at("/about.html").get(about_page);
    app.at("/ex/:start/:end").get(gen_examples_page);
    // Single-segment wildcard; `batch_page` accepts both "N" and "N.html"
    app.at("/:batch").get(batch_page);

    // ---- serve actual http ----

    eprintln!("Server listening on 127.0.0.1:8080");
    app.listen("127.0.0.1:8080").await?;

    Ok(())
}

/// Shared application state as seen by the request handlers.
type State = Arc<StateStruct>;
/// Data loaded once by `server_main` and shared by all handlers.
///
/// The handlers in this file only read from it. Every reference field is `'static`
/// because `server_main` leaks the underlying allocations.
// NOTE(review): `jmdict_raw` and `jmdict_xml` are not read anywhere in the visible part of
// this file, which is presumably what `allow(dead_code)` silences — confirm.
#[allow(dead_code)]
struct StateStruct {
    /// Full text of `data/JMdict_e.xml`.
    jmdict_raw: &'static str,
    /// XML document parsed from `jmdict_raw`.
    jmdict_xml: &'static roxmltree::Document<'static>,
    /// Index built over `jmdict_xml` by `index_jmdict`.
    jmdict_idx: DictIndex<'static>,
    /// Batches returned by `read_batches`; `batch_page` indexes into this slice.
    batches: &'static [Batch],
    /// Index page rendered once at startup and served by `home_page`.
    index: &'static str,
    /// Example sentences that passed the length filter in `server_main`.
    examples: &'static [Example],
    /// Number of occurrences of each character across `examples` (see `calc_example_freq`).
    example_freq: HashMap<char, usize>,
    /// Overrides passed to `gen_furigana` when annotating examples.
    furigana_overrides: HashMap<String, String>,
}

/// Serves the index page that was rendered once at startup.
async fn home_page(req: Request<State>) -> tide::Result {
    let page: &'static str = req.state().index;
    let response = tide::Response::builder(200)
        .body(page)
        .content_type(mime::HTML)
        .build();
    Ok(response)
}

/// Renders the "about" page on every request and returns it as HTML.
///
/// # Errors
///
/// Propagates any error reported by `format_about_to`.
async fn about_page(_req: Request<State>) -> tide::Result {
    let mut html: Vec<u8> = Vec::new();
    format_about_to(&mut html)?;

    let response = tide::Response::builder(200)
        .body(html)
        .content_type(mime::HTML)
        .build();
    Ok(response)
}

async fn batch_page(req: Request<State>) -> tide::Result {
    let batch_idx = req.param("batch")?;
    let batch_idx: usize = batch_idx
        .strip_suffix(".html")
        .unwrap_or(batch_idx)
        .parse()?;
    let batch = req
        .state()
        .batches
        .get(batch_idx)
        .ok_or(anyhow!("this batch number does not exist"))?;

    let mut buf = vec![];
    format_batch_to(
        &mut buf,
        &req.state().jmdict_idx,
        req.state().batches.len(),
        batch_idx,
        batch,
    )?;

    Ok(tide::Response::builder(200)
        .body(buf)
        .content_type(mime::HTML)
        .build())
}

async fn gen_examples_page(req: Request<State>) -> tide::Result {
    let first_level: usize = req.param("start")?.parse()?;
    let last_level: usize = req.param("end")?.parse()?;

    let allowed_chars = Charset::from_iter(
        req.state()
            .batches
            .get(..=last_level)
            .unwrap_or_default()
            .iter()
            .map(|b| b.chars.iter())
            .flatten(),
    );
    let needed_chars = Charset::from_iter(
        req.state()
            .batches
            .get(first_level..=last_level)
            .unwrap_or_default()
            .iter()
            .map(|b| b.chars.iter())
            .flatten(),
    );

    let mut examples = gen_examples(&req.state(), &allowed_chars, &needed_chars, 50);
    for ex in examples.iter_mut() {
        ex.gen_furigana(&req.state().jmdict_idx, &req.state().furigana_overrides);
    }

    let mut buf: Vec<u8> = vec![];
    for ex in examples.iter() {
        write!(&mut buf, "<p>{}</p>", ex.furigana_markup())?;
    }

    Ok(tide::Response::builder(200)
        .body(buf)
        .content_type(mime::HTML)
        .build())
}

// ---- example calculation ----

/// Counts, for every character, how many times it is yielded by `chars.iter()`
/// across all of `examples`.
fn calc_example_freq(examples: &[Example]) -> HashMap<char, usize> {
    let mut freq = HashMap::new();
    examples
        .iter()
        .flat_map(|example| example.chars.iter())
        .for_each(|ch| *freq.entry(ch).or_default() += 1);
    freq
}

/// Randomly picks up to `count` examples from `data.examples`.
///
/// An example is a candidate when `chars.diff(allowed_chars)` is empty and its chars
/// intersect `needed_chars`. Candidates are drawn one at a time without replacement.
/// While some of `needed_chars` has not appeared in a pick yet, only candidates
/// containing a still-missing character take part in the draw.
///
/// Each candidate is weighted by `1 / f²`, where `f` is the smallest `example_freq`
/// count among its characters, so examples containing rare characters are favoured.
///
/// Returns fewer than `count` examples if a round finds no eligible candidate.
/// Prints a tab-separated header and one row per pick to stdout.
///
/// # Panics
///
/// Panics if a candidate has no character with an entry in `data.example_freq`.
fn gen_examples(
    data: &StateStruct,
    allowed_chars: &Charset,
    needed_chars: &Charset,
    count: usize,
) -> Vec<Example> {
    let mut rng = thread_rng();
    let mut ret = vec![];

    // Pair every candidate with the count of its rarest character.
    // NOTE(review): `diff` is presumably set difference (chars of the example that are not
    // allowed) — confirm against `Charset`.
    let mut candidates = data
        .examples
        .iter()
        .filter(|x| x.chars.diff(&allowed_chars).is_empty() && x.chars.intersects(&needed_chars))
        .map(|ex| (ex, *ex.chars.iter().filter_map(|x| data.example_freq.get(&x)).min().unwrap()))
        .collect::<Vec<_>>();
    // Needed characters that no picked example has covered yet
    let mut remaining_needed = needed_chars.clone();

    // Union of the characters of all picked examples; only used for the log output
    let mut have_chars = Charset::new("");
    println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars");
    while ret.len() < count {
        // One-pass weighted pick: keep a running total of the weights and replace the
        // current selection with probability weight / total, so every counted candidate
        // ends up selected with probability proportional to its weight.
        let mut selection = None;
        let mut total_weight = 0f64;

        // Number of candidates that took part in this round (logged only)
        let mut counted = 0;
        for (i, (x, f)) in candidates.iter().enumerate() {
            // Until every needed char is covered, skip candidates that add none of the
            // missing ones
            if remaining_needed.len() > 0 && !x.chars.intersects(&remaining_needed) {
                continue;
            }

            counted += 1;
            // compensate twice for rare characters
            // - once to bring all chars to equal probability of sampling
            // - once to over-sample rare chars because we need to see them more
            let weight = 1f64 / (*f * *f) as f64;
            total_weight += weight;
            let rand: f64 = rng.gen();
            if rand < weight / total_weight {
                selection = Some((i, *f))
            }
        }

        if let Some((i, f)) = selection {
            // Remove the pick so that it cannot be drawn again
            let (ex, _) = candidates.remove(i);
            remaining_needed = remaining_needed.diff(&ex.chars);
            have_chars = have_chars.union(&ex.chars);
            ret.push(ex.clone());

            println!(
                "{}\t{}\t{}\t{}\t{}\t{}\t{}",
                ret.len(),
                f,
                have_chars.len(),
                remaining_needed.len(),
                allowed_chars.len(),
                counted,
                ex.chars.to_string()
            );
        } else {
            // No eligible candidate left: stop early with what we have
            break;
        }
    }

    ret
}