diff options
author | Alex Auvolat <alex@adnab.me> | 2024-03-15 17:36:55 +0100 |
---|---|---|
committer | Alex Auvolat <alex@adnab.me> | 2024-03-15 17:36:55 +0100 |
commit | 4cd9081dc3a41594174480c1565fd2427550c50a (patch) | |
tree | 8a09db13b11477eaba40fa1a12347015738a5f8d /src | |
parent | f6a778698b8f0b709a7afa693ab42f8a0c314fe7 (diff) | |
download | datagengo-4cd9081dc3a41594174480c1565fd2427550c50a.tar.gz datagengo-4cd9081dc3a41594174480c1565fd2427550c50a.zip |
reduce memory usage & fix scroll to top
Diffstat (limited to 'src')
-rw-r--r-- | src/datafiles.rs | 42 | ||||
-rw-r--r-- | src/example.rs | 24 | ||||
-rw-r--r-- | src/format.rs | 28 | ||||
-rw-r--r-- | src/main.rs | 4 | ||||
-rw-r--r-- | src/server.rs | 39 |
5 files changed, 70 insertions, 67 deletions
diff --git a/src/datafiles.rs b/src/datafiles.rs index 7a6a5d5..fc6194f 100644 --- a/src/datafiles.rs +++ b/src/datafiles.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::fs; use std::io::{self, BufRead}; +use std::sync::Arc; use anyhow::Result; use serde::{Deserialize, Serialize}; @@ -22,21 +23,46 @@ pub struct Example { // PARSING DATA FILES // ===================================================================== -pub type DictEntry<'a> = roxmltree::Node<'a, 'a>; -pub type DictIndex<'a> = HashMap<&'a str, Vec<DictEntry<'a>>>; -pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> { +pub struct DictEntry { + pub reb: String, + pub ent_seq: String, + pub sense: Box<[String]>, +} + +pub type DictIndex = HashMap<String, Vec<Arc<DictEntry>>>; +pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex { let dict = dict .root() .children() .find(|x| x.has_tag_name("JMdict")) .unwrap(); - let mut ret: DictIndex<'a> = HashMap::new(); - for x in dict.children().filter(|x| x.has_tag_name("entry")) { - for r in x.children().filter(|x| x.has_tag_name("k_ele")) { + let mut ret: DictIndex = HashMap::new(); + for ent in dict.children().filter(|x| x.has_tag_name("entry")) { + let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); + let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); + let reb = reb.text().unwrap().trim().to_string(); + + let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); + let ent_seq = ent_seq.text().unwrap().trim().to_string(); + + let sense = ent + .children() + .filter(|x| x.has_tag_name("sense")) + .filter_map(|sense| sense.children().find(|x| x.has_tag_name("gloss"))) + .map(|gloss| gloss.text().unwrap().trim().to_string()) + .collect::<Vec<_>>() + .into_boxed_slice(); + let parsed_ent = Arc::new(DictEntry { + reb, + ent_seq, + sense, + }); + + for r in ent.children().filter(|x| x.has_tag_name("k_ele")) { if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) { - let txt = keb.text().unwrap().trim(); - ret.entry(txt).or_default().push(x); + let txt = keb.text().unwrap().trim().to_string(); + ret.entry(txt).or_default().push(parsed_ent.clone()); } } } diff --git a/src/example.rs b/src/example.rs index 7d20a28..c52cc8f 100644 --- a/src/example.rs +++ b/src/example.rs @@ -4,11 +4,7 @@ use crate::charset::Charset; use crate::*; impl Example { - pub fn gen_furigana<'a>( - &mut self, - dict_idx: &DictIndex<'a>, - overrides: &HashMap<String, String>, - ) { + pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap<String, String>) { use std::fmt::Write; if let Some(v) = overrides.get(&self.ja) { @@ -51,13 +47,11 @@ impl Example { let reb = match reb { Some(reb) if reb.starts_with('#') => { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); - if let Some(ent) = ents.iter().find(|ent| { - let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap(); - ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap() - }) { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - reb.text().unwrap().trim() + if let Some(ent) = ents + .iter() + .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap()) + { + ent.reb.as_str() } else { println!("- entry id not found: {}", reb); ret += &word; @@ -69,11 +63,7 @@ impl Example { let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default(); let matches = ents .iter() - .map(|ent| { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - reb.text().unwrap().trim() - }) + .map(|ent| ent.reb.as_str()) .collect::<HashSet<_>>(); if matches.len() == 1 { *matches.iter().next().unwrap() diff --git a/src/format.rs b/src/format.rs index b9da487..3f10f8f 100644 --- a/src/format.rs +++ b/src/format.rs @@ -11,16 +11,16 @@ use crate::*; // FORMATTING TO HTML // ===================================================================== -pub fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) { +pub fn format_batch(dict_idx: &DictIndex, count: usize, (i, batch): (usize, &Batch)) { let mut f = io::BufWriter::new( fs::File::create(format!("public/{:03}.html", i)).expect("create batch file"), ); format_batch_to(&mut f, dict_idx, count, i, batch).expect("format batch"); } -pub fn format_batch_to<'a>( +pub fn format_batch_to( buf: &mut impl Write, - dict_idx: &DictIndex<'a>, + dict_idx: &DictIndex, count: usize, i: usize, batch: &Batch, @@ -200,32 +200,22 @@ fn format_vocab(buf: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<( Ok(()) } -fn dict_str_short<'a>( - qkeb: &str, - qreb: Option<&str>, - ent: &roxmltree::Node<'a, 'a>, -) -> Option<String> { - let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap(); - let reb = reb.text().unwrap().trim(); - - if qreb.map(|x| x != reb).unwrap_or(false) { +fn dict_str_short<'a>(qkeb: &str, qreb: Option<&str>, ent: &DictEntry) -> Option<String> { + if qreb.map(|x| x != ent.reb).unwrap_or(false) { return None; } Some(format!( r#"<span class="font_ja">{} 【{}】</span>"#, - qkeb, reb + qkeb, ent.reb )) } -fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> { +fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &DictEntry) -> Option<String> { let mut ret = dict_str_short(qkeb, qreb, ent)?; - for sense in ent.children().filter(|x| x.has_tag_name("sense")) { - if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { - ret.extend(format!(" {};", s.text().unwrap().trim()).chars()); - } + for sense in ent.sense.iter() { + ret += &format!(" {};", sense); } if ret.chars().rev().next() == Some(';') { diff --git a/src/main.rs b/src/main.rs index 1ad5e77..b59669d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -17,6 +17,10 @@ use charset::Charset; use datafiles::*; use format::*; +#[global_allocator] +static ALLOCATOR: cap::Cap<std::alloc::System> = + cap::Cap::new(std::alloc::System, usize::max_value()); + #[derive(Debug, StructOpt)] #[structopt(name = "datagengo", about = "Japanese example practice maker")] struct Opt { diff --git a/src/server.rs b/src/server.rs index 5e5d61b..14368e0 100644 --- a/src/server.rs +++ b/src/server.rs @@ -36,13 +36,14 @@ pub async fn server_main() -> tide::Result<()> { eprintln!("Loading furigana overrides..."); let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides"); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); eprintln!("Loading JMdict_e.xml..."); let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict"); - let jmdict_raw: &'static str = String::leak(jmdict_raw); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); eprintln!("Parsing JMdict_e.xml..."); - let jmdict = roxmltree::Document::parse_with_options( + let jmdict_xml = roxmltree::Document::parse_with_options( &jmdict_raw, roxmltree::ParsingOptions { allow_dtd: true, @@ -50,10 +51,14 @@ pub async fn server_main() -> tide::Result<()> { }, ) .expect("parse_jmdict"); - let jmdict_xml = Box::leak(Box::new(jmdict)); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); eprintln!("Indexing JMdict_e.xml..."); - let jmdict_idx = index_jmdict(jmdict_xml); + let jmdict_idx = index_jmdict(&jmdict_xml); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); + drop(jmdict_xml); + drop(jmdict_raw); + eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024); eprintln!("Loading batches.json..."); let batches = read_batches().expect("read/parse"); @@ -66,8 +71,6 @@ pub async fn server_main() -> tide::Result<()> { // ---- setup http server ---- let state: State = Box::leak(Box::new(StateStruct { - jmdict_raw, - jmdict_xml, jmdict_idx, batches, index, @@ -99,9 +102,7 @@ pub async fn server_main() -> tide::Result<()> { type State = &'static StateStruct; #[allow(dead_code)] struct StateStruct { - jmdict_raw: &'static str, - jmdict_xml: &'static roxmltree::Document<'static>, - jmdict_idx: DictIndex<'static>, + jmdict_idx: DictIndex, batches: &'static [Batch], index: &'static str, examples: &'static [Example], @@ -233,13 +234,7 @@ async fn gen_examples_page(mut req: Request<State>) -> tide::Result { } if let Some(ents) = state.jmdict_idx.get(keb) { for ent in ents.iter() { - let ent_r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap(); - let ent_reb = ent_r_ele - .children() - .find(|x| x.has_tag_name("reb")) - .unwrap(); - let ent_reb = ent_reb.text().unwrap().trim(); - if reb.map(|x| x != ent_reb).unwrap_or(false) { + if reb.map(|x| x != ent.reb).unwrap_or(false) { continue; } expl += &format!( @@ -247,17 +242,15 @@ async fn gen_examples_page(mut req: Request<State>) -> tide::Result { keb ); - for sense in ent.children().filter(|x| x.has_tag_name("sense")) { - if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) { - if !expl.ends_with('>') { - expl += "; "; - } - expl += s.text().unwrap().trim(); + for sense in ent.sense.iter() { + if !expl.ends_with('>') { + expl += "; "; } + expl += sense; } expl += &format!( r#"</td><td style="word-break: keep-all" class="tab_large font_ja">{}</td></tr>"#, - ent_reb + ent.reb ); } } |