aboutsummaryrefslogblamecommitdiff
path: root/src/server.rs
blob: 5e5d61b65d0fff11646496d9ebe5c37a46f002bf (plain) (tree)
1
2
3
4
5
6
7
8
            
 
                             
                                  
                     
                       

                     


                        
                      
                     




                                                




















                                                                                         





















                                                                                   





                                                                        
                                                       




                   


                           
        






                                                         

                                                         
                                          
                                                









                                                    
                                  






                                                      


                                                











































                                                             
 









                                                                         



















                                           



                                              

                                    




                                                     


                                                                          














                                                            

                                                            


                      



                                                            













                                                                                                  



                                                            



                                                                       



                                                                                                                                                             








                                                                                                 



                                                                                                              


                     
 





































                                                                                                                                




                                             
                                            
               




                                                        





                                

                     


                                  

                                   



                                           















                                                                    
                   



                            

                    


                                    
                               
                          




                                                                                                 









                                                              




                                                                      
                             
























                                                                                     
 
                           

                                             
                          






                                       

                                  




                  
          
 
use std::fs;

use anyhow::{anyhow, Result};
use futures::stream::TryStreamExt;
use rand::prelude::*;
use serde::Deserialize;

use http_types::mime;
use tide::Request;

use crate::datafiles::*;
use crate::example::*;
use crate::format::*;
use crate::*;

pub async fn server_main() -> tide::Result<()> {
    // ---- load data files ----

    eprintln!("Loading kanji levels...");
    let kanji_levels = read_kanji_levels().expect("read_kanji_levels");
    let all_kanji = Charset::new(
        kanji_levels
            .iter()
            .map(|(_, x)| x.to_string())
            .collect::<Vec<_>>()
            .join(""),
    );

    eprintln!("Loading examples...");
    let mut examples = read_examples(&all_kanji).expect("read_examples");
    examples.retain(|e| (5..=25).contains(&e.ja.chars().count()));
    let examples = Box::leak(examples.into_boxed_slice());

    eprintln!("Counting chars in examples...");
    let example_freq = calc_example_freq(&examples);

    eprintln!("Loading furigana overrides...");
    let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");

    eprintln!("Loading JMdict_e.xml...");
    let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
    let jmdict_raw: &'static str = String::leak(jmdict_raw);

    eprintln!("Parsing JMdict_e.xml...");
    let jmdict = roxmltree::Document::parse_with_options(
        &jmdict_raw,
        roxmltree::ParsingOptions {
            allow_dtd: true,
            ..Default::default()
        },
    )
    .expect("parse_jmdict");
    let jmdict_xml = Box::leak(Box::new(jmdict));

    eprintln!("Indexing JMdict_e.xml...");
    let jmdict_idx = index_jmdict(jmdict_xml);

    eprintln!("Loading batches.json...");
    let batches = read_batches().expect("read/parse");
    let batches = Box::leak(batches.into_boxed_slice());

    let mut index_bytes = Vec::new();
    format_index_to(&mut index_bytes, &batches, &kanji_levels).unwrap();
    let index = String::leak(String::from_utf8(index_bytes).unwrap());

    // ---- setup http server ----

    let state: State = Box::leak(Box::new(StateStruct {
        jmdict_raw,
        jmdict_xml,
        jmdict_idx,
        batches,
        index,
        examples,
        example_freq,
        furigana_overrides,
    }));

    let mut app = tide::with_state(state);
    app.with(tide::log::LogMiddleware::new());

    app.at("/").get(home_page);
    app.at("/index.html").get(home_page);
    app.at("/style.css").serve_file("static/style.css")?;
    app.at("/script.js").serve_file("static/script.js")?;
    app.at("/jquery.js").serve_file("static/jquery.js")?;
    app.at("/about.html").get(about_page);
    app.at("/gen.html").post(gen_examples_page);
    app.at("/:batch").get(batch_page);

    // ---- serve actual http ----

    eprintln!("Server listening on 127.0.0.1:8080");
    app.listen("127.0.0.1:8080").await?;

    Ok(())
}

/// Shared application state as handed to every request handler.
type State = &'static StateStruct;
/// Data loaded once by `server_main` and leaked so that handlers can borrow
/// it for `'static`.
// NOTE(review): `dead_code` is presumably allowed because some fields (e.g.
// `jmdict_raw`, `jmdict_xml`) are not read by the handlers in this file — confirm.
#[allow(dead_code)]
struct StateStruct {
    /// Full text of `data/JMdict_e.xml`.
    jmdict_raw: &'static str,
    /// XML document parsed from `jmdict_raw`.
    jmdict_xml: &'static roxmltree::Document<'static>,
    /// Lookup structure built from `jmdict_xml` by `index_jmdict`.
    jmdict_idx: DictIndex<'static>,
    /// Batches returned by `read_batches`; handlers index this slice by
    /// batch/level number.
    batches: &'static [Batch],
    /// Home page HTML, rendered once at startup by `format_index_to`.
    index: &'static str,
    /// Example sentences from `read_examples`, restricted to 5..=25 characters.
    examples: &'static [Example],
    /// Per-character counters computed over `examples` by `calc_example_freq`.
    example_freq: HashMap<char, usize>,
    /// Result of `read_furigana_overrides`, passed to `gen_furigana` when
    /// practice examples are generated.
    furigana_overrides: HashMap<String, String>,
}

/// Serves the index page that was pre-rendered into the shared state.
async fn home_page(req: Request<State>) -> tide::Result {
    let page: &'static str = req.state().index;
    let response = tide::Response::builder(200)
        .body(page)
        .content_type(mime::HTML)
        .build();
    Ok(response)
}

/// Renders the "about" page with `format_about_to` and returns it as HTML.
async fn about_page(_req: Request<State>) -> tide::Result {
    let mut html: Vec<u8> = Vec::new();
    format_about_to(&mut html)?;

    let response = tide::Response::builder(200)
        .body(html)
        .content_type(mime::HTML)
        .build();
    Ok(response)
}

async fn batch_page(req: Request<State>) -> tide::Result {
    let batch_idx = req.param("batch")?;
    let batch_idx: usize = batch_idx
        .strip_suffix(".html")
        .unwrap_or(batch_idx)
        .parse()?;
    let batch = req
        .state()
        .batches
        .get(batch_idx)
        .ok_or(anyhow!("this batch number does not exist"))?;

    let mut buf = vec![];
    format_batch_to(
        &mut buf,
        &req.state().jmdict_idx,
        req.state().batches.len(),
        batch_idx,
        batch,
    )?;

    Ok(tide::Response::builder(200)
        .body(buf)
        .content_type(mime::HTML)
        .build())
}

/// Form body accepted by `gen_examples_page`.
///
/// The two fields may arrive in either order: the handler takes their minimum
/// as the first level and their maximum as the last level.
#[derive(Deserialize)]
struct GenParam {
    /// One end of the level range; used as an index into the batch list.
    first_level: usize,
    /// The other end of the level range; used as an index into the batch list.
    last_level: usize,
}

async fn gen_examples_page(mut req: Request<State>) -> tide::Result {
    let param: GenParam = req.body_form().await?;
    let first_level = std::cmp::min(param.first_level, param.last_level);
    let last_level = std::cmp::max(param.first_level, param.last_level);

    let allowed_chars = Charset::from_iter(
        req.state()
            .batches
            .get(..=last_level)
            .unwrap_or_default()
            .iter()
            .map(|b| b.chars.iter())
            .flatten(),
    );
    let needed_chars = Charset::from_iter(
        req.state()
            .batches
            .get(first_level..=last_level)
            .unwrap_or_default()
            .iter()
            .map(|b| b.chars.iter())
            .flatten(),
    );

    let (tx, rx) = async_channel::unbounded();

    let state: State = req.state();
    std::thread::spawn(move || {
        tx.send_blocking(Ok(format!(
            r#"
        <!DOCTYPE html>
        <html>
            <head>
                <meta charset=\"UTF-8\" />
                <title>{:03} - {:03} practice</title>
                <link rel="stylesheet" type="text/css" href="style.css" />
                <script src="jquery.js"></script>
                <script src="script.js"></script>
            </head>
            <body>
              <div class="batch_page">
                <p><a href="index.html">index</a></p>
                <p>Practice for {:03} - {:03}</p>
                <hr />
                <div id="gen_section">
                  <div id="gen_ex_cnt">
                  </div>
                  <div id="gen_ex_display">
                  </div>
                  <div id="gen_ex_en">
                  </div>
                  <div id="gen_ex_words" class="vocabtable">
                  </div>
                  <div id="gen_ex_kanji" class="vocabtable">
                  </div>
                </div>
              </div>
            </body>
        "#,
            first_level, last_level, first_level, last_level
        )
        .into_bytes()))?;

        gen_examples(state, &allowed_chars, &needed_chars, 50, |mut ex| {
            ex.gen_furigana(&req.state().jmdict_idx, &req.state().furigana_overrides);

            let mut expl = "<table>".to_string();
            for word in ex.expl.split(|c| c == ' ' || c == '~') {
                let (keb, reb) = expl_clean_word(word);
                let wchars = Charset::new(keb);
                if !wchars.intersects(&allowed_chars) {
                    continue;
                }
                if let Some(ents) = state.jmdict_idx.get(keb) {
                    for ent in ents.iter() {
                        let ent_r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
                        let ent_reb = ent_r_ele
                            .children()
                            .find(|x| x.has_tag_name("reb"))
                            .unwrap();
                        let ent_reb = ent_reb.text().unwrap().trim();
                        if reb.map(|x| x != ent_reb).unwrap_or(false) {
                            continue;
                        }
                        expl += &format!(
                            r#"<tr><td style="word-break: keep-all">&nbsp;&nbsp;<span class="tab_large font_ja">{}</span>&nbsp;&nbsp;</td><td width="50%">"#,
                            keb
                        );

                        for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
                            if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
                                if !expl.ends_with('>') {
                                    expl += "; ";
                                }
                                expl += s.text().unwrap().trim();
                            }
                        }
                        expl += &format!(
                            r#"</td><td style="word-break: keep-all" class="tab_large font_ja">{}</td></tr>"#,
                            ent_reb
                        );
                    }
                }
            }

            let mut kanji = "<table>".to_string();
            let mut chrvec = ex
                .chars
                .iter()
                .map(|chr| {
                    (
                        chr,
                        req.state()
                            .batches
                            .iter()
                            .take(last_level + 1)
                            .enumerate()
                            .flat_map(|(ib, b)| {
                                b.examples
                                    .iter()
                                    .filter(|ex| ex.chars.contains(chr))
                                    .map(move |ex| (ib, ex))
                            })
                            .collect::<Vec<_>>(),
                    )
                })
                .collect::<Vec<_>>();
            chrvec.sort_by_key(|(_, exs)| exs.len());
            for (chr, exs) in chrvec.iter().take(5) {
                for (cnt, (ib, ex)) in exs.iter().enumerate().take(4) {
                    if cnt == 0 {
                        kanji +=
                            &format!(r#"<tr><td class="tab_large font_ja">{}&nbsp;&nbsp;"#, chr);
                    } else {
                        kanji += &format!(r#"<tr><td>"#);
                    }
                    kanji += &format!(
                        r#"</td><td><a href="{:03}.html">{:03}</a>&nbsp;&nbsp;</td><td class="tab_large font_ja">{}</td></tr>"#,
                        ib, ib, ex.ja
                    );
                }
            }

            let item = serde_json::json!({
                "ja": ex.ja,
                "en": ex.en,
                "furi": ex.furigana_markup(),
                "vocab": expl + "</table>",
                "kanji": kanji + "</table>",
            });
            tx.send_blocking(Ok(format!(
                "<script> add_example({}); </script>\n",
                serde_json::to_string(&item)?
            )
            .into_bytes()))?;
            Ok(())
        })?;

        tx.send_blocking(Ok(br#"
            </body>
            </html>
        "#
        .to_vec()))?;

        Ok::<_, anyhow::Error>(())
    });

    Ok(tide::Response::builder(200)
        .body(tide::Body::from_reader(
            Box::pin(rx).into_async_read(),
            None,
        ))
        .content_type(mime::HTML)
        .build())
}

// ---- example calculation ----

/// Builds a map from character to a counter that is incremented once for
/// every character yielded by `chars.iter()` of every example.
fn calc_example_freq(examples: &[Example]) -> HashMap<char, usize> {
    examples
        .iter()
        .flat_map(|example| example.chars.iter())
        .fold(HashMap::new(), |mut freq, ch| {
            *freq.entry(ch).or_insert(0) += 1;
            freq
        })
}

/// Draws up to `count` examples at random and hands each one to `callback`.
///
/// An example is a candidate when all of its characters are in
/// `allowed_chars` and it shares at least one character with `needed_chars`.
/// As long as some needed characters have not appeared in a drawn example,
/// each draw is limited to candidates containing at least one of them. Every
/// candidate is weighted by `1 / f²`, where `f` is the smallest
/// `example_freq` value among its characters. One progress line per drawn
/// example is printed to stdout.
///
/// Returns early, without error, when no candidate can be drawn any more.
///
/// # Errors
///
/// Propagates the first error returned by `callback`.
fn gen_examples<F>(
    data: &StateStruct,
    allowed_chars: &Charset,
    needed_chars: &Charset,
    count: usize,
    mut callback: F,
) -> Result<()>
where
    F: FnMut(Example) -> Result<()>,
{
    let mut rng = thread_rng();

    // Pair every candidate with the frequency of its rarest character.
    let mut pool: Vec<_> = data
        .examples
        .iter()
        .filter(|cand| {
            cand.chars.diff(allowed_chars).is_empty() && cand.chars.intersects(needed_chars)
        })
        .map(|cand| {
            let rarest = cand
                .chars
                .iter()
                .filter_map(|c| data.example_freq.get(&c))
                .min()
                .unwrap();
            (cand, *rarest)
        })
        .collect();

    let mut still_needed = needed_chars.clone();
    let mut covered = Charset::new("");

    println!("Ex\tMinCnt\tChars\tNeeded\tAllowed\tCandidates\tChars");
    for generated in 1..=count {
        let mut pick = None;
        let mut weight_sum = 0f64;
        let mut eligible = 0;

        // Single-pass weighted draw: each eligible candidate replaces the
        // current pick with probability weight / (sum of weights so far).
        for (idx, &(cand, freq)) in pool.iter().enumerate() {
            let helps = still_needed.len() == 0 || cand.chars.intersects(&still_needed);
            if !helps {
                continue;
            }

            eligible += 1;
            // compensate twice for rare characters
            // - once to bring all chars to equal probability of sampling
            // - once to over-sample rare chars because we need to see them more
            let weight = 1f64 / (freq * freq) as f64;
            weight_sum += weight;
            if rng.gen::<f64>() < weight / weight_sum {
                pick = Some((idx, freq));
            }
        }

        let Some((idx, freq)) = pick else {
            break;
        };

        let (ex, _) = pool.remove(idx);
        still_needed = still_needed.diff(&ex.chars);
        covered = covered.union(&ex.chars);

        println!(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}",
            generated,
            freq,
            covered.len(),
            still_needed.len(),
            allowed_chars.len(),
            eligible,
            ex.chars.to_string()
        );

        callback(ex.clone())?;
    }

    Ok(())
}