Browse Source

Factor out data utils

Michael Ekstrand 2 years ago
parent
commit
d3fcbb3f35
6 changed files with 136 additions and 53 deletions
  1. 9
    9
      Cargo.lock
  2. 1
    1
      Cargo.toml
  3. 8
    43
      src/bin/parse-marc.rs
  4. 2
    0
      src/lib.rs
  5. 90
    0
      src/pgutils.rs
  6. 26
    0
      src/tsv.rs

+ 9
- 9
Cargo.lock

@@ -47,6 +47,15 @@ name = "bitflags"
 version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
+[[package]]
+name = "bookdata"
+version = "0.1.0"
+dependencies = [
+ "flate2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
+ "quick-xml 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "structopt 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "build_const"
 version = "0.2.1"
@@ -190,15 +199,6 @@ dependencies = [
  "miniz_oxide 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
-[[package]]
-name = "ol-processing-tools"
-version = "0.1.0"
-dependencies = [
- "flate2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
- "quick-xml 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)",
- "structopt 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)",
-]
-
 [[package]]
 name = "proc-macro2"
 version = "0.4.24"

+ 1
- 1
Cargo.toml

@@ -1,5 +1,5 @@
 [package]
-name = "ol-processing-tools"
+name = "bookdata"
 version = "0.1.0"
 authors = ["Michael Ekstrand <michaelekstrand@boisestate.edu>"]
 

+ 8
- 43
src/bin/parse-marc.rs

@@ -2,6 +2,7 @@
 extern crate structopt;
 extern crate quick_xml;
 extern crate flate2;
+extern crate bookdata;
 
 use std::io::prelude::*;
 use std::io::{self, BufReader};
@@ -13,6 +14,9 @@ use quick_xml::Reader;
 use quick_xml::events::Event;
 use flate2::read::GzDecoder;
 
+use bookdata::pgutils::write_encoded;
+use bookdata::tsv::split_first;
+
 #[derive(StructOpt, Debug)]
 #[structopt(name="parse-marc")]
 struct Opt {
@@ -30,16 +34,9 @@ fn process_delim_file<R: BufRead, W: Write>(r: &mut R, w: &mut W) -> io::Result<
   let mut count = 0;
   for line in r.lines() {
     let lstr = line?;
-    match lstr.find('\t') {
-      Some(i) => {
-        let (id, xml) = lstr.split_at(i);
-        let mut parse = Reader::from_str(xml);
-        process_record(&mut parse, w, &mut count);
-      },
-      None => {
-        panic!("invalid line");
-      }
-    }
+    let (_id, xml) = split_first(&lstr).expect("invalid line");
+    let mut parse = Reader::from_str(xml);
+    process_record(&mut parse, w, &mut count);
   }
 
   Ok(count)
@@ -66,38 +63,6 @@ fn write_codes<W: Write>(w: &mut W, rno: i32, fno: i32, tag: &[u8], fld: Option<
   Ok(())
 }
 
-fn write_data<W: Write>(w: &mut W, buf: &[u8]) -> io::Result<()> {
-  let mut start = 0;
-  for i in 0..buf.len() {
-    match buf[i] {
-      b'\\' => {
-        w.write_all(&buf[start..i])?;
-        start = i + 1;
-        w.write_all(b"\\\\")?;
-      },
-      b'\r' => {
-        w.write_all(&buf[start..i])?;
-        start = i + 1;
-      },
-      b'\n' => {
-        w.write_all(&buf[start..i])?;
-        start = i + 1;
-        w.write_all(b"\\n")?;
-      },
-      b'\t' => {
-        w.write_all(&buf[start..i])?;
-        start = i + 1;
-        w.write_all(b"\\t")?;
-      },
-      _ => ()
-    }
-  }
-  if start < buf.len() {
-    w.write_all(&buf[start..])?;
-  }
-  Ok(())
-}
-
 fn write_nl<W: Write>(w: &mut W) -> io::Result<()> {
   w.write_all(b"\n")
 }
@@ -187,7 +152,7 @@ fn process_record<B: BufRead, W: Write>(rdr: &mut Reader<B>, out: &mut W, lno: &
       Ok(Event::Text(e)) => {
         if output {
           let t = e.unescaped().expect("decode error");
-          write_data(out, &t).expect("output error")
+          write_encoded(out, &t).expect("output error")
         }
       },
       Ok(Event::Eof) => break,

+ 2
- 0
src/lib.rs

@@ -0,0 +1,2 @@
+pub mod pgutils;
+pub mod tsv;

+ 90
- 0
src/pgutils.rs

@@ -0,0 +1,90 @@
+use std::io::{self, Write};
+use std::str;
+
+/// Writes `buf` to `w` with backslash escapes applied.
+///
+/// A backslash is emitted as two backslashes, a newline as backslash + `n`
+/// and a tab as backslash + `t`; carriage returns are dropped.  Every other
+/// byte is copied through unchanged.  The first error reported by `w` is
+/// returned to the caller.
+pub fn write_encoded<W: Write>(w: &mut W, buf: &[u8]) -> io::Result<()> {
+  // Offset of the first byte that has not been written yet.
+  let mut pending = 0;
+  for (pos, byte) in buf.iter().enumerate() {
+    // Replacement text for bytes needing special treatment; plain bytes
+    // stay in the pending run and are written in bulk later.
+    let escape: &[u8] = match *byte {
+      b'\\' => b"\\\\",
+      b'\r' => b"",
+      b'\n' => b"\\n",
+      b'\t' => b"\\t",
+      _ => continue,
+    };
+    // Flush the plain run preceding this byte, then its replacement.
+    w.write_all(&buf[pending..pos])?;
+    pending = pos + 1;
+    if !escape.is_empty() {
+      w.write_all(escape)?;
+    }
+  }
+  // Whatever follows the last special byte still has to go out.
+  if pending < buf.len() {
+    w.write_all(&buf[pending..])?;
+  }
+  Ok(())
+}
+
+/// Encoding an empty buffer must produce no output at all.
+#[test]
+fn it_writes_empty() {
+  let mut out: Vec<u8> = Vec::new();
+  write_encoded(&mut out, b"").unwrap();
+
+  assert!(out.is_empty());
+}
+
+/// Text without any special bytes is written through verbatim.
+#[test]
+fn it_writes_str() {
+  let mut out: Vec<u8> = Vec::new();
+  write_encoded(&mut out, b"foo").unwrap();
+
+  let text = String::from_utf8(out).unwrap();
+  assert_eq!(text, "foo");
+}
+
+/// A single backslash is doubled on output.
+#[test]
+fn encode_backslash() {
+  let mut out: Vec<u8> = Vec::new();
+  write_encoded(&mut out, b"\\").unwrap();
+
+  let text = String::from_utf8(out).unwrap();
+  assert_eq!(text, "\\\\");
+}
+
+/// A tab byte becomes the two-character sequence backslash + `t`.
+#[test]
+fn encode_tab() {
+  let mut out: Vec<u8> = Vec::new();
+  write_encoded(&mut out, b"\t").unwrap();
+
+  let text = String::from_utf8(out).unwrap();
+  assert_eq!(text, "\\t");
+}
+
+/// A newline byte becomes the two-character sequence backslash + `n`.
+#[test]
+fn encode_nl() {
+  let mut out: Vec<u8> = Vec::new();
+  write_encoded(&mut out, b"\n").unwrap();
+
+  let text = String::from_utf8(out).unwrap();
+  assert_eq!(text, "\\n");
+}
+
+/// Carriage returns are dropped rather than escaped.
+#[test]
+fn skip_cr() {
+  let mut out: Vec<u8> = Vec::new();
+  write_encoded(&mut out, b"\r").unwrap();
+
+  let text = String::from_utf8(out).unwrap();
+  assert_eq!(text, "");
+}
+
+/// Special bytes embedded in ordinary text are escaped in place.
+#[test]
+fn embedded() {
+  let mut out: Vec<u8> = Vec::new();
+  write_encoded(&mut out, b"foo\nbar\\wombat").unwrap();
+
+  let text = String::from_utf8(out).unwrap();
+  assert_eq!(text, "foo\\nbar\\\\wombat");
+}

+ 26
- 0
src/tsv.rs

@@ -0,0 +1,26 @@
+/// Splits `line` at its first tab character.
+///
+/// Returns the text before the tab and the text after it (the tab itself is
+/// not part of either half), or `None` when `line` contains no tab.  Later
+/// tabs are left untouched in the second half.
+pub fn split_first<'a>(line: &'a str) -> Option<(&'a str, &'a str)> {
+  line.find('\t').map(|tab| {
+    let head = &line[..tab];
+    let tail = &line[tab + 1..];
+    (head, tail)
+  })
+}
+
+/// An empty line has no tab, so there is nothing to split.
+#[test]
+fn split_empty() {
+  let parts = split_first("");
+  assert_eq!(parts, None);
+}
+
+/// A single tab separates the line into its two fields.
+#[test]
+fn split_tab() {
+  let parts = split_first("foo\tbar");
+  assert_eq!(parts, Some(("foo", "bar")));
+}
+
+/// A trailing tab yields an empty second field.
+#[test]
+fn split_end() {
+  let parts = split_first("foo\t");
+  assert_eq!(parts, Some(("foo", "")));
+}
+
+/// Only the first tab splits; later tabs stay in the second field.
+#[test]
+fn split_2() {
+  let parts = split_first("foo\tbar\tblatz");
+  assert_eq!(parts, Some(("foo", "bar\tblatz")));
+}