From 4cd9081dc3a41594174480c1565fd2427550c50a Mon Sep 17 00:00:00 2001
From: Alex Auvolat <alex@adnab.me>
Date: Fri, 15 Mar 2024 17:36:55 +0100
Subject: reduce memory usage & fix scroll to top

---
 Cargo.lock       |  7 +++++++
 Cargo.toml       |  2 ++
 src/datafiles.rs | 42 ++++++++++++++++++++++++++++++++++--------
 src/example.rs   | 24 +++++++-----------------
 src/format.rs    | 28 +++++++++-------------------
 src/main.rs      |  4 ++++
 src/server.rs    | 39 ++++++++++++++++-----------------------
 static/script.js |  6 +++++-
 8 files changed, 84 insertions(+), 68 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index df12506..613b503 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -479,6 +479,12 @@ version = "3.15.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa"
 
+[[package]]
+name = "cap"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f125eb85b84a24c36b02ed1d22c9dd8632f53b3cde6e4d23512f94021030003"
+
 [[package]]
 name = "cc"
 version = "1.0.90"
@@ -674,6 +680,7 @@ dependencies = [
  "anyhow",
  "async-channel 2.2.0",
  "async-std",
+ "cap",
  "fasthash",
  "futures",
  "http-types",
diff --git a/Cargo.toml b/Cargo.toml
index 11b7e6a..db5359f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,3 +22,5 @@ futures = "0.3"
 async-std = { version = "1.6.0", features = ["attributes"] }
 http-types = "2.12"
 async-channel = "2.0"
+
+cap = "0.1.2"
diff --git a/src/datafiles.rs b/src/datafiles.rs
index 7a6a5d5..fc6194f 100644
--- a/src/datafiles.rs
+++ b/src/datafiles.rs
@@ -1,6 +1,7 @@
 use std::collections::HashMap;
 use std::fs;
 use std::io::{self, BufRead};
+use std::sync::Arc;
 
 use anyhow::Result;
 use serde::{Deserialize, Serialize};
@@ -22,21 +23,46 @@ pub struct Example {
 //                      PARSING DATA FILES
 // =====================================================================
 
-pub type DictEntry<'a> = roxmltree::Node<'a, 'a>;
-pub type DictIndex<'a> = HashMap<&'a str, Vec<DictEntry<'a>>>;
-pub fn index_jmdict<'a>(dict: &'a roxmltree::Document) -> DictIndex<'a> {
+pub struct DictEntry {
+    pub reb: String,
+    pub ent_seq: String,
+    pub sense: Box<[String]>,
+}
+
+pub type DictIndex = HashMap<String, Vec<Arc<DictEntry>>>;
+pub fn index_jmdict(dict: &roxmltree::Document) -> DictIndex {
     let dict = dict
         .root()
         .children()
         .find(|x| x.has_tag_name("JMdict"))
         .unwrap();
 
-    let mut ret: DictIndex<'a> = HashMap::new();
-    for x in dict.children().filter(|x| x.has_tag_name("entry")) {
-        for r in x.children().filter(|x| x.has_tag_name("k_ele")) {
+    let mut ret: DictIndex = HashMap::new();
+    for ent in dict.children().filter(|x| x.has_tag_name("entry")) {
+        let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
+        let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
+        let reb = reb.text().unwrap().trim().to_string();
+
+        let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
+        let ent_seq = ent_seq.text().unwrap().trim().to_string();
+
+        let sense = ent
+            .children()
+            .filter(|x| x.has_tag_name("sense"))
+            .filter_map(|sense| sense.children().find(|x| x.has_tag_name("gloss")))
+            .map(|gloss| gloss.text().unwrap().trim().to_string())
+            .collect::<Vec<_>>()
+            .into_boxed_slice();
+        let parsed_ent = Arc::new(DictEntry {
+            reb,
+            ent_seq,
+            sense,
+        });
+
+        for r in ent.children().filter(|x| x.has_tag_name("k_ele")) {
             if let Some(keb) = r.children().find(|x| x.has_tag_name("keb")) {
-                let txt = keb.text().unwrap().trim();
-                ret.entry(txt).or_default().push(x);
+                let txt = keb.text().unwrap().trim().to_string();
+                ret.entry(txt).or_default().push(parsed_ent.clone());
             }
         }
     }
diff --git a/src/example.rs b/src/example.rs
index 7d20a28..c52cc8f 100644
--- a/src/example.rs
+++ b/src/example.rs
@@ -4,11 +4,7 @@ use crate::charset::Charset;
 use crate::*;
 
 impl Example {
-    pub fn gen_furigana<'a>(
-        &mut self,
-        dict_idx: &DictIndex<'a>,
-        overrides: &HashMap<String, String>,
-    ) {
+    pub fn gen_furigana(&mut self, dict_idx: &DictIndex, overrides: &HashMap<String, String>) {
         use std::fmt::Write;
 
         if let Some(v) = overrides.get(&self.ja) {
@@ -51,13 +47,11 @@ impl Example {
             let reb = match reb {
                 Some(reb) if reb.starts_with('#') => {
                     let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
-                    if let Some(ent) = ents.iter().find(|ent| {
-                        let ent_seq = ent.children().find(|x| x.has_tag_name("ent_seq")).unwrap();
-                        ent_seq.text().unwrap().trim() == reb.strip_prefix('#').unwrap()
-                    }) {
-                        let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
-                        let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
-                        reb.text().unwrap().trim()
+                    if let Some(ent) = ents
+                        .iter()
+                        .find(|ent| ent.ent_seq == reb.strip_prefix('#').unwrap())
+                    {
+                        ent.reb.as_str()
                     } else {
                         println!("- entry id not found: {}", reb);
                         ret += &word;
@@ -69,11 +63,7 @@ impl Example {
                     let ents = dict_idx.get(keb).map(|x| &x[..]).unwrap_or_default();
                     let matches = ents
                         .iter()
-                        .map(|ent| {
-                            let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
-                            let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
-                            reb.text().unwrap().trim()
-                        })
+                        .map(|ent| ent.reb.as_str())
                         .collect::<HashSet<_>>();
                     if matches.len() == 1 {
                         *matches.iter().next().unwrap()
diff --git a/src/format.rs b/src/format.rs
index b9da487..3f10f8f 100644
--- a/src/format.rs
+++ b/src/format.rs
@@ -11,16 +11,16 @@ use crate::*;
 //                          FORMATTING TO HTML
 // =====================================================================
 
-pub fn format_batch<'a>(dict_idx: &DictIndex<'a>, count: usize, (i, batch): (usize, &Batch)) {
+pub fn format_batch(dict_idx: &DictIndex, count: usize, (i, batch): (usize, &Batch)) {
     let mut f = io::BufWriter::new(
         fs::File::create(format!("public/{:03}.html", i)).expect("create batch file"),
     );
     format_batch_to(&mut f, dict_idx, count, i, batch).expect("format batch");
 }
 
-pub fn format_batch_to<'a>(
+pub fn format_batch_to(
     buf: &mut impl Write,
-    dict_idx: &DictIndex<'a>,
+    dict_idx: &DictIndex,
     count: usize,
     i: usize,
     batch: &Batch,
@@ -200,32 +200,22 @@ fn format_vocab(buf: &mut impl Write, vocab: &[&JlptVocab], t: &str) -> Result<(
     Ok(())
 }
 
-fn dict_str_short<'a>(
-    qkeb: &str,
-    qreb: Option<&str>,
-    ent: &roxmltree::Node<'a, 'a>,
-) -> Option<String> {
-    let r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
-    let reb = r_ele.children().find(|x| x.has_tag_name("reb")).unwrap();
-    let reb = reb.text().unwrap().trim();
-
-    if qreb.map(|x| x != reb).unwrap_or(false) {
+fn dict_str_short<'a>(qkeb: &str, qreb: Option<&str>, ent: &DictEntry) -> Option<String> {
+    if qreb.map(|x| x != ent.reb).unwrap_or(false) {
         return None;
     }
 
     Some(format!(
         r#"<span class="font_ja">{} 【{}】</span>"#,
-        qkeb, reb
+        qkeb, ent.reb
     ))
 }
 
-fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &roxmltree::Node<'a, 'a>) -> Option<String> {
+fn dict_str<'a>(qkeb: &str, qreb: Option<&str>, ent: &DictEntry) -> Option<String> {
     let mut ret = dict_str_short(qkeb, qreb, ent)?;
 
-    for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
-        if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
-            ret.extend(format!(" {};", s.text().unwrap().trim()).chars());
-        }
+    for sense in ent.sense.iter() {
+        ret += &format!(" {};", sense);
     }
 
     if ret.chars().rev().next() == Some(';') {
diff --git a/src/main.rs b/src/main.rs
index 1ad5e77..b59669d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -17,6 +17,10 @@ use charset::Charset;
 use datafiles::*;
 use format::*;
 
+#[global_allocator]
+static ALLOCATOR: cap::Cap<std::alloc::System> =
+    cap::Cap::new(std::alloc::System, usize::max_value());
+
 #[derive(Debug, StructOpt)]
 #[structopt(name = "datagengo", about = "Japanese example practice maker")]
 struct Opt {
diff --git a/src/server.rs b/src/server.rs
index 5e5d61b..14368e0 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -36,13 +36,14 @@ pub async fn server_main() -> tide::Result<()> {
 
     eprintln!("Loading furigana overrides...");
     let furigana_overrides = read_furigana_overrides().expect("read_furigana_overrides");
+    eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
 
     eprintln!("Loading JMdict_e.xml...");
     let jmdict_raw = fs::read_to_string("data/JMdict_e.xml").expect("read_jmdict");
-    let jmdict_raw: &'static str = String::leak(jmdict_raw);
+    eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
 
     eprintln!("Parsing JMdict_e.xml...");
-    let jmdict = roxmltree::Document::parse_with_options(
+    let jmdict_xml = roxmltree::Document::parse_with_options(
         &jmdict_raw,
         roxmltree::ParsingOptions {
             allow_dtd: true,
@@ -50,10 +51,14 @@ pub async fn server_main() -> tide::Result<()> {
         },
     )
     .expect("parse_jmdict");
-    let jmdict_xml = Box::leak(Box::new(jmdict));
+    eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
 
     eprintln!("Indexing JMdict_e.xml...");
-    let jmdict_idx = index_jmdict(jmdict_xml);
+    let jmdict_idx = index_jmdict(&jmdict_xml);
+    eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
+    drop(jmdict_xml);
+    drop(jmdict_raw);
+    eprintln!("RAM: {}", ALLOCATOR.allocated() / 1024);
 
     eprintln!("Loading batches.json...");
     let batches = read_batches().expect("read/parse");
@@ -66,8 +71,6 @@ pub async fn server_main() -> tide::Result<()> {
     // ---- setup http server ----
 
     let state: State = Box::leak(Box::new(StateStruct {
-        jmdict_raw,
-        jmdict_xml,
         jmdict_idx,
         batches,
         index,
@@ -99,9 +102,7 @@ pub async fn server_main() -> tide::Result<()> {
 type State = &'static StateStruct;
 #[allow(dead_code)]
 struct StateStruct {
-    jmdict_raw: &'static str,
-    jmdict_xml: &'static roxmltree::Document<'static>,
-    jmdict_idx: DictIndex<'static>,
+    jmdict_idx: DictIndex,
     batches: &'static [Batch],
     index: &'static str,
     examples: &'static [Example],
@@ -233,13 +234,7 @@ async fn gen_examples_page(mut req: Request<State>) -> tide::Result {
                 }
                 if let Some(ents) = state.jmdict_idx.get(keb) {
                     for ent in ents.iter() {
-                        let ent_r_ele = ent.children().find(|x| x.has_tag_name("r_ele")).unwrap();
-                        let ent_reb = ent_r_ele
-                            .children()
-                            .find(|x| x.has_tag_name("reb"))
-                            .unwrap();
-                        let ent_reb = ent_reb.text().unwrap().trim();
-                        if reb.map(|x| x != ent_reb).unwrap_or(false) {
+                        if reb.map(|x| x != ent.reb).unwrap_or(false) {
                             continue;
                         }
                         expl += &format!(
@@ -247,17 +242,15 @@ async fn gen_examples_page(mut req: Request<State>) -> tide::Result {
                             keb
                         );
 
-                        for sense in ent.children().filter(|x| x.has_tag_name("sense")) {
-                            if let Some(s) = sense.children().find(|x| x.has_tag_name("gloss")) {
-                                if !expl.ends_with('>') {
-                                    expl += "; ";
-                                }
-                                expl += s.text().unwrap().trim();
+                        for sense in ent.sense.iter() {
+                            if !expl.ends_with('>') {
+                                expl += "; ";
                             }
+                            expl += sense;
                         }
                         expl += &format!(
                             r#"</td><td style="word-break: keep-all" class="tab_large font_ja">{}</td></tr>"#,
-                            ent_reb
+                            ent.reb
                         );
                     }
                 }
diff --git a/static/script.js b/static/script.js
index 639e743..10db48c 100644
--- a/static/script.js
+++ b/static/script.js
@@ -37,6 +37,10 @@ function display_example(i) {
   }
 }
 
+function scrollTop() {
+  window.scrollTo(0, 0);
+}
+
 function spacebar() {
   if (revealed) {
     console.log("Next item");
@@ -46,5 +50,5 @@ function spacebar() {
     $("#gen_section").removeClass("gen_hidden");
     revealed = true;
   }
-  window.scrollTo(0, 0);
+  window.setTimeout(scrollTop, 1);
 }
-- 
cgit v1.2.3