From 4cd9081dc3a41594174480c1565fd2427550c50a Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Fri, 15 Mar 2024 17:36:55 +0100 Subject: reduce memory usage & fix scroll to top --- Cargo.lock | 7 +++++++ Cargo.toml | 2 ++ src/datafiles.rs | 42 ++++++++++++++++++++++++++++++++++-------- src/example.rs | 24 +++++++----------------- src/format.rs | 28 +++++++++------------------- src/main.rs | 4 ++++ src/server.rs | 39 ++++++++++++++++----------------------- static/script.js | 6 +++++- 8 files changed, 84 insertions(+), 68 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index df12506..613b503 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -479,6 +479,12 @@ version = "3.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" +[[package]] +name = "cap" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f125eb85b84a24c36b02ed1d22c9dd8632f53b3cde6e4d23512f94021030003" + [[package]] name = "cc" version = "1.0.90" @@ -674,6 +680,7 @@ dependencies = [ "anyhow", "async-channel 2.2.0", "async-std", + "cap", "fasthash", "futures", "http-types", diff --git a/Cargo.toml b/Cargo.toml index 11b7e6a..db5359f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,5 @@ futures = "0.3" async-std = { version = "1.6.0", features = ["attributes"] } http-types = "2.12" async-channel = "2.0" + +cap = "0.1.2" diff --git a/src/datafiles.rs b/src/datafiles.rs index 7a6a5d5..fc6194f 100644 --- a/src/datafiles.rs +++ b/src/datafiles.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::fs; use std::io::{self, BufRead}; +use std::sync::Arc; use anyhow::Result; use serde::{Deserialize, Serialize}; @@ -22,21 +23,46 @@ pub struct Example { // PARSING DATA FILES // ===================================================================== -pub type DictEntry<'a> = roxmltree::Node<'a, 'a>; -pub type DictIndex<'a> = HashMap<&'a str, Vec>>; -pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { +pub struct DictEntry { + pub reb: String, + pub ent_seq: String, + pub sense: Box<[String]>, +} + +pub type DictIndex = HashMap>>; +pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex { let dict = dict .root() .children() .find(|x| x.has_tag_name("JMdict")) .unwrap(); - let mut ret: DictIndex<'a> = HashMap::new(); - for x in dict.children().filter(|x| x.has_tag_name("entry")) { - for r in x.children().filter(|x| x.has_tag_name("k_ele")) { + let mut ret: DictIndex = HashMap::new(); + for ent in dict.children().filter(|x| x.has_tag_name("entry")) { + let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); + let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + let reb = reb.text().unwrap().trim().to_string(); + + let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); + let ent_seq = ent_seq.text().unwrap().trim().to_string(); + + let sense = ent + .children() + .filter(|x| x.has_tag_name("sense")) + .filter_map(|sense| sense.children().find(|x| x.has_tag_name("gloss"))) + .map(|gloss| gloss.text().unwrap().trim().to_string()) + .collect::>() + .into_boxed_slice(); + let parsed_ent = Arc::new(DictEntry { + reb, + ent_seq, + sense, + }); + + for r in ent.children().filter(|x| x.has_tag_name("k_ele")) { if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { - let txt = keb.text().unwrap().trim(); - ret.entry(txt).or_default().push(x); + let txt = keb.text().unwrap().trim().to_string(); + ret.entry(txt).or_default().push(parsed_ent.clone()); } } } diff --git a/src/example.rs b/src/example.rs index 7d20a28..c52cc8f 100644 --- a/src/example.rs +++ b/src/example.rs @@ -4,11 +4,7 @@ use crate::charset::Charset; use crate::*; impl Example { - pub fn gen_furigana<'a>( - &mut self, - dict_idx: &DictIndex<'a>, - overrides: &HashMap, - ) { + pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap) { use std::fmt::Write; if let Some(v) = overrides.get(&self.ja) { @@ -51,13 +47,11 @@ impl Example { let reb = match reb { Some(reb) if reb.starts_with('#') => { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); - if let Some(ent) = ents.iter().find(|ent| { - let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); - ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap() - }) { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - reb.text().unwrap().trim() + if let Some(ent) = ents + .iter() + .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap()) + { + ent.reb.as_str() } else { println!("- entry id not found: {}", reb); ret += &word; @@ -69,11 +63,7 @@ impl Example { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); let matches = ents .iter() - .map(|ent| { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - reb.text().unwrap().trim() - }) + .map(|ent| ent.reb.as_str()) .collect::>(); if matches.len() == 1 { *matches.iter().next().unwrap() diff --git a/src/format.rs b/src/format.rs index b9da487..3f10f8f 100644 --- a/src/format.rs +++ b/src/format.rs @@ -11,16 +11,16 @@ use crate::*; // FORMATTING TO HTML // ===================================================================== -pub fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { +pub fn format_batch(dict_idx: &DictIndex, count: usize, (i, batch): (usize, &Batch)) { let mut f = io::BufWriter::new( fs::File::create(format!("public/{:03}.html", i)).expect("create batch file"), ); format_batch_to(&mut f, dict_idx, count, i, batch).expect("format batch"); } -pub fn format_batch_to<'a>( +pub fn format_batch_to( buf: &mut impl Write, - dict_idx: &DictIndex<'a>, + dict_idx: &DictIndex, count: usize, i: usize, batch: &Batch, @@ -200,32 +200,22 @@ fn format_vocab(buf: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<( Ok(()) } -fn dict_str_short<'a>( - qkeb: &str, - qreb: Option<&str>, - ent: &roxmltree::Node<'a, 'a>, -) -> Option { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - let reb = reb.text().unwrap().trim(); - - if qreb.map(|x| x != reb).unwrap_or(false) { +fn dict_str_short<'a>(qkeb: &str, qreb: Option<&str>, ent: &DictEntry) -> Option { + if qreb.map(|x| x != ent.reb).unwrap_or(false) { return None; } Some(format!( r#"{} 【{}】"#, - qkeb, reb + qkeb, ent.reb )) } -fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option { +fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &DictEntry) -> Option { let mut ret = dict_str_short(qkeb, qreb, ent)?; - for sense in ent.children().filter(|x| x.has_tag_name("sense")) { - if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { - ret.extend(format!(" {};", s.text().unwrap().trim()).chars()); - } + for sense in ent.sense.iter() { + ret += &format!(" {};", sense); } if ret.chars().rev().next() == Some(';') { diff --git a/src/main.rs b/src/main.rs index 1ad5e77..b59669d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -17,6 +17,10 @@ use charset::Charset; use datafiles::*; use format::*; +#[global_allocator] +static ALLOCATOR: cap::Cap = + cap::Cap::new(std::alloc::System, usize::max_value()); + #[derive(Debug, StructOpt)] #[structopt(name = "datagengo", about = "Japanese example practice maker")] struct Opt { diff --git a/src/server.rs b/src/server.rs index 5e5d61b..14368e0 100644 --- a/src/server.rs +++ b/src/server.rs @@ -36,13 +36,14 @@ pub async fn server_main() -> tide::Result<()> { eprintln!("Loading furigana overrides..."); let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides"); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); eprintln!("Loading JMdict_e.xml..."); let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); - let jmdict_raw: &'static str = String::leak(jmdict_raw); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); eprintln!("Parsing JMdict_e.xml..."); - let jmdict = roxmltree::Document::parse_with_options( + let jmdict_xml = roxmltree::Document::parse_with_options( &jmdict_raw, roxmltree::ParsingOptions { allow_dtd: true, @@ -50,10 +51,14 @@ pub async fn server_main() -> tide::Result<()> { }, ) .expect("parse_jmdict"); - let jmdict_xml = Box::leak(Box::new(jmdict)); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); eprintln!("Indexing JMdict_e.xml..."); - let jmdict_idx = index_jmdict(jmdict_xml); + let jmdict_idx = index_jmdict(&jmdict_xml); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); + drop(jmdict_xml); + drop(jmdict_raw); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); eprintln!("Loading batches.json..."); let batches = read_batches().expect("read/parse"); @@ -66,8 +71,6 @@ pub async fn server_main() -> tide::Result<()> { // ---- setup http server ---- let state: State = Box::leak(Box::new(StateStruct { - jmdict_raw, - jmdict_xml, jmdict_idx, batches, index, @@ -99,9 +102,7 @@ pub async fn server_main() -> tide::Result<()> { type State = &'static StateStruct; #[allow(dead_code)] struct StateStruct { - jmdict_raw: &'static str, - jmdict_xml: &'static roxmltree::Document<'static>, - jmdict_idx: DictIndex<'static>, + jmdict_idx: DictIndex, batches: &'static [Batch], index: &'static str, examples: &'static [Example], @@ -233,13 +234,7 @@ async fn gen_examples_page(mut req: Request) -> tide::Result { } if let Some(ents) = state.jmdict_idx.get(keb) { for ent in ents.iter() { - let ent_r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let ent_reb = ent_r_ele - .children() - .find(|x| x.has_tag_name("reb")) - .unwrap(); - let ent_reb = ent_reb.text().unwrap().trim(); - if reb.map(|x| x != ent_reb).unwrap_or(false) { + if reb.map(|x| x != ent.reb).unwrap_or(false) { continue; } expl += &format!( @@ -247,17 +242,15 @@ async fn gen_examples_page(mut req: Request) -> tide::Result { keb ); - for sense in ent.children().filter(|x| x.has_tag_name("sense")) { - if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { - if !expl.ends_with('>') { - expl += "; "; - } - expl += s.text().unwrap().trim(); + for sense in ent.sense.iter() { + if !expl.ends_with('>') { + expl += "; "; } + expl += sense; } expl += &format!( r#"{}"#, - ent_reb + ent.reb ); } } diff --git a/static/script.js b/static/script.js index 639e743..10db48c 100644 --- a/static/script.js +++ b/static/script.js @@ -37,6 +37,10 @@ function display_example(i) { } } +function scrollTop() { + window.scrollTo(0, 0); +} + function spacebar() { if (revealed) { console.log("Next item"); @@ -46,5 +50,5 @@ function spacebar() { $("#gen_section").removeClass("gen_hidden"); revealed = true; } - window.scrollTo(0, 0); + window.setTimeout(scrollTop, 1); } -- cgit v1.2.3