parse_isbns.rs 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. use std::io::prelude::*;
  2. use std::path::{Path,PathBuf};
  3. use std::fs::File;
  4. use std::io::{BufReader, Lines};
  5. use std::mem::drop;
  6. use anyhow::{Result, anyhow};
  7. use regex::{Regex, Captures};
  8. use postgres::Connection;
  9. use postgres::rows::LazyRows;
  10. use log::*;
  11. use structopt::{StructOpt};
  12. use fallible_iterator::FallibleIterator;
  13. use super::Command;
  14. use crate::db::DbOpts;
  15. use crate::tsv::split_first;
  16. /// Parse MARC files into records for a PostgreSQL table.
  17. #[derive(StructOpt, Debug)]
  18. #[structopt(name="parse-isbns")]
  19. pub struct ParseISBNs {
  20. #[structopt(flatten)]
  21. db: DbOpts,
  22. /// The table from which to parse ISBNs.
  23. #[structopt(short="-s", long="src-table")]
  24. src_table: Option<String>,
  25. /// The file from which to parse ISBNs.
  26. #[structopt(short="-f", long="src-file")]
  27. src_file: Option<PathBuf>,
  28. /// Print unmatched entries
  29. #[structopt(short="-U", long="print-unmatched")]
  30. print_unmatched: bool,
  31. /// Print ignored entries
  32. #[structopt(short="-I", long="print-ignored")]
  33. print_ignored: bool
  34. }
  /// An ISBN extracted from a single source entry.
  #[derive(Debug)]
  struct ISBN {
      /// The ISBN text exactly as captured by the matching pattern.
      text: String,
      /// The descriptor captured alongside the ISBN, when the entry had one.
      descriptor: Option<String>,
  }
  40. fn parse_plain<'t>(m: &Captures<'t>) -> ISBN {
  41. let isbn = m.get(1).unwrap().as_str();
  42. let desc = m.get(2).map(|m| m.as_str().to_owned());
  43. ISBN {
  44. text: isbn.to_owned(),
  45. descriptor: desc
  46. }
  47. }
  48. static PARSERS: &'static [(&'static str, fn(&Captures) -> ISBN)] = &[
  49. ("^(?:[a-z][[:space:]]+|\\([[:digit:]]+\\)[[:space:]]+|\\*)?([0-9 -][Xx]?)(?:[[:space:]]*\\((.+?)\\))?",
  50. parse_plain)
  51. ];
  /// Patterns for entries that are reported as ignored rather than unmatched.
  ///
  /// The single pattern matches text consisting of a `$` or `£` followed only by
  /// digits, periods, commas, and spaces (apparently a price).
  static IGNORES: &[&str] = &["^[$£][[:digit:]., ]+$"];
  55. struct IsbnParser {
  56. parsers: Vec<(Regex, &'static fn(&Captures) -> ISBN)>,
  57. ignores: Vec<Regex>
  58. }
  59. #[derive(Debug)]
  60. enum ParseResult {
  61. Valid(ISBN),
  62. Ignored(String),
  63. Unmatched(String)
  64. }
  65. impl IsbnParser {
  66. fn compile() -> Result<IsbnParser> {
  67. let mut compiled = Vec::with_capacity(PARSERS.len());
  68. for (pat, func) in PARSERS {
  69. let rex = Regex::new(pat)?;
  70. compiled.push((rex, func));
  71. }
  72. let mut comp_ignore = Vec::with_capacity(IGNORES.len());
  73. for pat in IGNORES {
  74. comp_ignore.push(Regex::new(pat)?);
  75. }
  76. Ok(IsbnParser {
  77. parsers: compiled,
  78. ignores: comp_ignore
  79. })
  80. }
  81. fn parse<'a>(&self, text: &'a str) -> ParseResult {
  82. for (rex, func) in &self.parsers {
  83. if let Some(cap) = rex.captures(text) {
  84. return ParseResult::Valid(func(&cap))
  85. }
  86. }
  87. for rex in &self.ignores {
  88. if rex.is_match(text) {
  89. return ParseResult::Ignored(text.to_owned())
  90. }
  91. }
  92. ParseResult::Unmatched(text.to_owned())
  93. }
  94. }
  95. type IdPR = (i64, ParseResult);
  96. struct FileSource<B> {
  97. parsers: IsbnParser,
  98. lines: Lines<B>
  99. }
  100. impl <B: BufRead> FallibleIterator for FileSource<B> {
  101. type Item = IdPR;
  102. type Error = anyhow::Error;
  103. fn next(&mut self) -> Result<Option<IdPR>> {
  104. let nl = self.lines.next();
  105. match nl {
  106. None => Ok(None),
  107. Some(line) => {
  108. let text = line?;
  109. let (id, isbn) = split_first(&text).unwrap();
  110. Ok(Some((id.parse::<i64>()?, self.parsers.parse(isbn))))
  111. }
  112. }
  113. }
  114. }
  115. struct DBSource<'t, 's> {
  116. parsers: IsbnParser,
  117. rows: LazyRows<'t, 's>
  118. }
  119. impl <'t, 's> DBSource<'t, 's> {
  120. fn create(rows: LazyRows<'t, 's>) -> Result<DBSource<'t, 's>> {
  121. Ok(DBSource {
  122. parsers: IsbnParser::compile()?,
  123. rows: rows
  124. })
  125. }
  126. }
  127. impl <'t, 's> FallibleIterator for DBSource<'t, 's> {
  128. type Item = IdPR;
  129. type Error = anyhow::Error;
  130. fn next(&mut self) -> Result<Option<IdPR>> {
  131. if let Some(row) = self.rows.next()? {
  132. let id: i32 = row.get(0);
  133. let content: String = row.get(1);
  134. let result = self.parsers.parse(&content);
  135. debug!("{}: {:?}", id, result);
  136. Ok(Some((id.into(), result)))
  137. } else {
  138. Ok(None)
  139. }
  140. }
  141. }
  /// Counters for the outcomes of a scan; `Default` starts every counter at zero.
  #[derive(Default)]
  struct MatchStats {
      /// Number of records examined.
      total: u64,
      /// Number of records for which a parser pattern matched.
      valid: u64,
      /// Number of records that matched an ignore pattern.
      ignored: u64,
      /// Number of records that matched no pattern at all.
      unmatched: u64,
  }
  158. impl ParseISBNs {
  159. fn scan_source<R>(&self, iter: &mut R) -> Result<MatchStats>
  160. where R: FallibleIterator<Item = IdPR, Error = anyhow::Error> {
  161. let mut stats = MatchStats::default();
  162. while let Some((id, result)) = iter.next()? {
  163. debug!("{}: {:?}", id, result);
  164. match result {
  165. ParseResult::Valid(isbn) => {
  166. stats.valid += 1;
  167. },
  168. ParseResult::Ignored (s)=> {
  169. stats.ignored += 1;
  170. if self.print_ignored {
  171. println!("ignored {}: {}", id, s)
  172. }
  173. },
  174. ParseResult::Unmatched(s) => {
  175. stats.unmatched += 1;
  176. if self.print_unmatched {
  177. println!("unmatched {}: {}", id, s);
  178. }
  179. }
  180. }
  181. stats.total += 1;
  182. }
  183. Ok(stats)
  184. }
  185. fn scan_file(&self, file: &Path) -> Result<MatchStats> {
  186. let parsers = IsbnParser::compile()?;
  187. let input = File::open(file)?;
  188. let input = BufReader::new(input);
  189. let mut src = FileSource {
  190. parsers: parsers,
  191. lines: input.lines()
  192. };
  193. self.scan_source(&mut src)
  194. }
  195. fn scan_db(&self, db: &Connection, table: &str) -> Result<MatchStats> {
  196. let query = format!("SELECT * FROM {}", table);
  197. let txn = db.transaction()?;
  198. let stmt = txn.prepare(&query)?;
  199. let rows = stmt.lazy_query(&txn, &[], 1000)?;
  200. let mut src = DBSource::create(rows)?;
  201. let stats = self.scan_source(&mut src)?;
  202. drop(src);
  203. drop(stmt);
  204. txn.commit()?;
  205. Ok(stats)
  206. }
  207. }
  208. impl Command for ParseISBNs {
  209. fn exec(self) -> Result<()> {
  210. let stats = if let Some(ref tbl) = self.src_table {
  211. let db = self.db.open()?;
  212. let n = self.scan_db(&db, tbl)?;
  213. n
  214. } else if let Some(ref path) = self.src_file {
  215. self.scan_file(&path)?
  216. } else {
  217. error!("no source data specified");
  218. return Err(anyhow!("no source data"));
  219. };
  220. info!("processed {} ISBN records", stats.total);
  221. info!("matched {}, ignored {}, and {} were unmatched",
  222. stats.valid, stats.ignored, stats.unmatched);
  223. Ok(())
  224. }
  225. }
Tip!

Press p to see the previous file, or n to see the next file.