123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260 |
- use std::io::prelude::*;
- use std::path::{Path,PathBuf};
- use std::fs::File;
- use std::io::{BufReader, Lines};
- use std::mem::drop;
- use anyhow::{Result, anyhow};
- use regex::{Regex, Captures};
- use postgres::Connection;
- use postgres::rows::LazyRows;
- use log::*;
- use structopt::{StructOpt};
- use fallible_iterator::FallibleIterator;
- use super::Command;
- use crate::db::DbOpts;
- use crate::tsv::split_first;
- /// Parse MARC files into records for a PostgreSQL table.
- #[derive(StructOpt, Debug)]
- #[structopt(name="parse-isbns")]
- pub struct ParseISBNs {
- #[structopt(flatten)]
- db: DbOpts,
- /// The table from which to parse ISBNs.
- #[structopt(short="-s", long="src-table")]
- src_table: Option<String>,
- /// The file from which to parse ISBNs.
- #[structopt(short="-f", long="src-file")]
- src_file: Option<PathBuf>,
- /// Print unmatched entries
- #[structopt(short="-U", long="print-unmatched")]
- print_unmatched: bool,
- /// Print ignored entries
- #[structopt(short="-I", long="print-ignored")]
- print_ignored: bool
- }
/// An ISBN extracted from a source string.
#[derive(Debug)]
struct ISBN {
    /// The ISBN text exactly as captured from the source string.
    text: String,
    /// The parenthesized descriptor that followed the ISBN, when one was present.
    descriptor: Option<String>,
}
- fn parse_plain<'t>(m: &Captures<'t>) -> ISBN {
- let isbn = m.get(1).unwrap().as_str();
- let desc = m.get(2).map(|m| m.as_str().to_owned());
- ISBN {
- text: isbn.to_owned(),
- descriptor: desc
- }
- }
- static PARSERS: &'static [(&'static str, fn(&Captures) -> ISBN)] = &[
- ("^(?:[a-z][[:space:]]+|\\([[:digit:]]+\\)[[:space:]]+|\\*)?([0-9 -][Xx]?)(?:[[:space:]]*\\((.+?)\\))?",
- parse_plain)
- ];
/// Patterns for text that is recognized but deliberately not treated as an ISBN.
///
/// The single entry matches a whole string made of a `$` or `£` sign followed
/// by digits, dots, commas and spaces.
static IGNORES: &[&str] = &["^[$£][[:digit:]., ]+$"];
- struct IsbnParser {
- parsers: Vec<(Regex, &'static fn(&Captures) -> ISBN)>,
- ignores: Vec<Regex>
- }
- #[derive(Debug)]
- enum ParseResult {
- Valid(ISBN),
- Ignored(String),
- Unmatched(String)
- }
- impl IsbnParser {
- fn compile() -> Result<IsbnParser> {
- let mut compiled = Vec::with_capacity(PARSERS.len());
- for (pat, func) in PARSERS {
- let rex = Regex::new(pat)?;
- compiled.push((rex, func));
- }
- let mut comp_ignore = Vec::with_capacity(IGNORES.len());
- for pat in IGNORES {
- comp_ignore.push(Regex::new(pat)?);
- }
- Ok(IsbnParser {
- parsers: compiled,
- ignores: comp_ignore
- })
- }
- fn parse<'a>(&self, text: &'a str) -> ParseResult {
- for (rex, func) in &self.parsers {
- if let Some(cap) = rex.captures(text) {
- return ParseResult::Valid(func(&cap))
- }
- }
- for rex in &self.ignores {
- if rex.is_match(text) {
- return ParseResult::Ignored(text.to_owned())
- }
- }
- ParseResult::Unmatched(text.to_owned())
- }
- }
- type IdPR = (i64, ParseResult);
- struct FileSource<B> {
- parsers: IsbnParser,
- lines: Lines<B>
- }
- impl <B: BufRead> FallibleIterator for FileSource<B> {
- type Item = IdPR;
- type Error = anyhow::Error;
- fn next(&mut self) -> Result<Option<IdPR>> {
- let nl = self.lines.next();
- match nl {
- None => Ok(None),
- Some(line) => {
- let text = line?;
- let (id, isbn) = split_first(&text).unwrap();
- Ok(Some((id.parse::<i64>()?, self.parsers.parse(isbn))))
- }
- }
- }
- }
- struct DBSource<'t, 's> {
- parsers: IsbnParser,
- rows: LazyRows<'t, 's>
- }
- impl <'t, 's> DBSource<'t, 's> {
- fn create(rows: LazyRows<'t, 's>) -> Result<DBSource<'t, 's>> {
- Ok(DBSource {
- parsers: IsbnParser::compile()?,
- rows: rows
- })
- }
- }
- impl <'t, 's> FallibleIterator for DBSource<'t, 's> {
- type Item = IdPR;
- type Error = anyhow::Error;
- fn next(&mut self) -> Result<Option<IdPR>> {
- if let Some(row) = self.rows.next()? {
- let id: i32 = row.get(0);
- let content: String = row.get(1);
- let result = self.parsers.parse(&content);
- debug!("{}: {:?}", id, result);
- Ok(Some((id.into(), result)))
- } else {
- Ok(None)
- }
- }
- }
/// Running tallies of parse outcomes for one scan.
///
/// `Default` yields all-zero counters.
#[derive(Debug, Default)]
struct MatchStats {
    /// Number of records processed.
    total: u64,
    /// Number of records whose text matched an ISBN pattern.
    valid: u64,
    /// Number of records whose text matched an ignore pattern.
    ignored: u64,
    /// Number of records whose text matched nothing.
    unmatched: u64,
}
- impl ParseISBNs {
- fn scan_source<R>(&self, iter: &mut R) -> Result<MatchStats>
- where R: FallibleIterator<Item = IdPR, Error = anyhow::Error> {
- let mut stats = MatchStats::default();
- while let Some((id, result)) = iter.next()? {
- debug!("{}: {:?}", id, result);
- match result {
- ParseResult::Valid(isbn) => {
- stats.valid += 1;
- },
- ParseResult::Ignored (s)=> {
- stats.ignored += 1;
- if self.print_ignored {
- println!("ignored {}: {}", id, s)
- }
- },
- ParseResult::Unmatched(s) => {
- stats.unmatched += 1;
- if self.print_unmatched {
- println!("unmatched {}: {}", id, s);
- }
- }
- }
- stats.total += 1;
- }
- Ok(stats)
- }
- fn scan_file(&self, file: &Path) -> Result<MatchStats> {
- let parsers = IsbnParser::compile()?;
- let input = File::open(file)?;
- let input = BufReader::new(input);
- let mut src = FileSource {
- parsers: parsers,
- lines: input.lines()
- };
- self.scan_source(&mut src)
- }
- fn scan_db(&self, db: &Connection, table: &str) -> Result<MatchStats> {
- let query = format!("SELECT * FROM {}", table);
- let txn = db.transaction()?;
- let stmt = txn.prepare(&query)?;
- let rows = stmt.lazy_query(&txn, &[], 1000)?;
- let mut src = DBSource::create(rows)?;
- let stats = self.scan_source(&mut src)?;
- drop(src);
- drop(stmt);
- txn.commit()?;
- Ok(stats)
- }
- }
- impl Command for ParseISBNs {
- fn exec(self) -> Result<()> {
- let stats = if let Some(ref tbl) = self.src_table {
- let db = self.db.open()?;
- let n = self.scan_db(&db, tbl)?;
- n
- } else if let Some(ref path) = self.src_file {
- self.scan_file(&path)?
- } else {
- error!("no source data specified");
- return Err(anyhow!("no source data"));
- };
- info!("processed {} ISBN records", stats.total);
- info!("matched {}, ignored {}, and {} were unmatched",
- stats.valid, stats.ignored, stats.unmatched);
- Ok(())
- }
- }
|