parsers.rs 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617
  1. use anyhow::Result;
  2. use regex::{Regex, RegexSet, Match, Captures};
  /// A single ISBN extracted from a larger string.
  ///
  /// `Clone` and `Eq` are derived alongside `Debug`/`PartialEq` so parsed values can be
  /// copied and compared freely by callers.
  #[derive(Debug, Clone, PartialEq, Eq)]
  pub struct ISBN {
      /// The ISBN text with the parser's "clean" pattern (marks, separators, stray letters)
      /// stripped out.
      pub text: String,
      /// Tags that followed the ISBN in parentheses or brackets, in order of appearance.
      pub tags: Vec<String>,
  }
  9. /// Result of parsing an ISBN string
  10. #[derive(Debug, PartialEq)]
  11. pub enum ParseResult {
  12. Valid(Vec<ISBN>, String),
  13. Ignored(String),
  14. Unmatched(String)
  15. }
  /// Regular expressions for unparsable ISBN strings to ignore.
  ///
  /// Input that yields no ISBN but matches one of these is reported as ignored rather
  /// than unmatched, which cleans up warning displays.
  static IGNORES: &[&str] = &[
      // Optional `$`, then digits/periods/commas/spaces, an optional 1-4 character
      // letter-or-`*` suffix, and an optional parenthesized note (presumably prices).
      r"^[$]?[[:digit:]., ]+(?:[a-zA-Z*]{1,4})?(\s+\(.*?\))?$",
      // Two or more slash-separated groups of digits and periods.
      r"^[[:digit:].]+(/[[:digit:].]+)+$",
      // `X-Y-` (two capital letters, each followed by a hyphen) and then 8+ digits.
      r"^[A-Z]-[A-Z]-\d{8,}",
      // Empty or whitespace-only input.
      r"^\s*$",
  ];
  24. /// Definitions for parsing ISBN strings.
  25. pub struct ParserDefs {
  26. lead: Regex,
  27. isbn: Regex,
  28. tag: Regex,
  29. tag_sep: Regex,
  30. tail_skip: Regex,
  31. clean: Regex,
  32. unmatch_ignore: RegexSet
  33. }
  34. impl ParserDefs {
  35. pub fn new() -> ParserDefs {
  36. fn cre(p: &str) -> Regex {
  37. // we use unwrap instead of result since regex compile failure is a programming error
  38. Regex::new(p).unwrap()
  39. }
  40. ParserDefs {
  41. lead: cre(r"^[;.]?\s*(?:[a-z]\s+|\(\d+\)\s+|\*|ISBN\s+)?"),
  42. isbn: cre(r"^([\p{Nonspacing Mark}0-9-]{8,}[Xx]?|[0-9]{1,5}(?:[a-zA-Z]+|[ +])[0-9-]{4,})"),
  43. tag: cre(r"^\s*[(\[](.+?)[)\]]"),
  44. tag_sep: cre(r"\s*:\s*"),
  45. tail_skip: cre(r"^\s*[;:/.]?"),
  46. clean: cre(r"[\p{Nonspacing Mark}a-wyzA-WYZ -]"),
  47. unmatch_ignore: RegexSet::new(IGNORES).unwrap()
  48. }
  49. }
  50. /// Create a new parser to parse a string.
  51. pub fn create_parser<'p, 's>(&'p self, s: &'s str) -> IsbnParser<'p, 's> {
  52. IsbnParser {
  53. defs: self,
  54. string: s,
  55. position: 0
  56. }
  57. }
  58. /// Parse a string
  59. pub fn parse(&self, s: &str) -> ParseResult {
  60. let mut parser = self.create_parser(s);
  61. parser.read_all()
  62. }
  63. }
  64. pub struct IsbnParser<'p, 's> {
  65. defs: &'p ParserDefs,
  66. string: &'s str,
  67. position: usize
  68. }
  /// Copy `s`, keeping only its ASCII characters.
  fn preclean(s: &str) -> String {
      // The result can never be longer than the input, so reserve that much up front.
      let mut ascii = String::with_capacity(s.len());
      ascii.extend(s.chars().filter(char::is_ascii));
      ascii
  }
  78. impl <'p, 's> IsbnParser<'p, 's> {
  79. /// Get the remaining (unparsed) text from the parser
  80. fn remaining(&self) -> &'s str {
  81. &self.string[self.position..]
  82. }
  83. /// Advance the parse position to the end of a regex patch, if possible.
  84. fn eat(&mut self, rex: &Regex) -> bool {
  85. let slice = self.remaining();
  86. if let Some(m) = rex.find(slice) {
  87. self.advance(m.end());
  88. true
  89. } else {
  90. false
  91. }
  92. }
  93. /// See if a regex matches.
  94. fn peek(&self, rex: &Regex) -> Option<Match> {
  95. let slice = self.remaining();
  96. rex.find(slice)
  97. }
  98. /// See if a regex matches, and advance if it does.
  99. fn read(&mut self, rex: &Regex) -> Option<Match<'s>> {
  100. let slice = self.remaining();
  101. let res = rex.find(slice);
  102. if let Some(m) = res {
  103. self.advance(m.end());
  104. }
  105. res
  106. }
  107. /// Read with capture groups
  108. fn read_cap(&mut self, rex: &Regex) -> Option<Captures<'s>> {
  109. let slice = self.remaining();
  110. let res = rex.captures(slice);
  111. if let Some(ref m) = res {
  112. self.advance(m.get(0).unwrap().end());
  113. }
  114. res
  115. }
  116. /// Advance the parse position by `n` characters.
  117. fn advance(&mut self, n: usize) {
  118. self.position += n;
  119. }
  120. fn is_empty(&self) -> bool {
  121. self.position == self.string.len()
  122. }
  123. /// Read a single ISBN
  124. fn read_isbn(&mut self) -> Option<ISBN> {
  125. self.eat(&self.defs.lead);
  126. self.read(&self.defs.isbn).map(|m| ISBN {
  127. text: self.defs.clean.replace_all(m.as_str(), "").to_string(),
  128. tags: self.read_tags()
  129. })
  130. }
  131. /// Read tags (assuming an ISBN has just been read)
  132. fn read_tags(&mut self) -> Vec<String> {
  133. let mut tags = Vec::new();
  134. while let Some(m) = self.read_cap(&self.defs.tag) {
  135. let tag = m.get(1).unwrap().as_str();
  136. for t in self.defs.tag_sep.split(tag) {
  137. tags.push(t.to_owned());
  138. }
  139. }
  140. tags
  141. }
  142. /// Read all ISBNs
  143. fn read_all(&mut self) -> ParseResult {
  144. let mut isbns = Vec::new();
  145. while let Some(res) = self.read_isbn() {
  146. isbns.push(res);
  147. // advance through our skip
  148. self.eat(&self.defs.tail_skip);
  149. }
  150. if isbns.is_empty() {
  151. if self.defs.unmatch_ignore.is_match(self.string) {
  152. ParseResult::Ignored(self.string.to_owned())
  153. } else {
  154. ParseResult::Unmatched(self.string.to_owned())
  155. }
  156. } else {
  157. ParseResult::Valid(isbns, self.remaining().to_owned())
  158. }
  159. }
  160. }
  /// Pure-ASCII input passes through `preclean` unchanged.
  #[test]
  fn test_preclean_keep() {
      let cleaned = preclean("foo");
      assert_eq!(cleaned.as_str(), "foo");
  }
  /// `preclean` drops the non-ASCII mark embedded in the digits.
  #[test]
  fn test_preclean_caron() {
      let cleaned = preclean("349̌224010X");
      assert_eq!(cleaned.as_str(), "349224010X");
  }
  /// A freshly created parser sits at offset 0 with the whole input remaining.
  #[test]
  fn test_parser_initial() {
      let target = "jimbob";
      let defs = ParserDefs::new();
      let parser = defs.create_parser(target);

      assert_eq!(parser.position, 0);
      assert_eq!(parser.string, target);
      assert_eq!(parser.remaining(), target);
  }
  /// `eat` reports failure and leaves the position alone when the pattern does not match.
  #[test]
  fn test_eat_nomatch() {
      let defs = ParserDefs::new();
      let mut parser = defs.create_parser("jimbob");
      let digit = Regex::new(r"^\d").unwrap();

      let ate = parser.eat(&digit);

      assert!(!ate);
      assert_eq!(parser.position, 0);
  }
  /// `eat` advances past a matching prefix and leaves the rest of the input.
  #[test]
  fn test_eat_match() {
      let defs = ParserDefs::new();
      let mut parser = defs.create_parser("jimbob");
      let jim = Regex::new(r"^jim").unwrap();

      let ate = parser.eat(&jim);

      assert!(ate);
      assert_eq!(parser.position, 3);
      assert!(!parser.is_empty());
      assert_eq!(parser.remaining(), "bob");
  }
  /// Anchored patterns match relative to the current position, so `eat` can be repeated.
  #[test]
  fn test_eat_later() {
      let defs = ParserDefs::new();
      let mut parser = defs.create_parser("jimjim");
      let jim = Regex::new(r"^jim").unwrap();

      // Each successful eat moves three bytes further along.
      for expected_pos in [3, 6] {
          assert!(parser.eat(&jim));
          assert_eq!(parser.position, expected_pos);
      }
      assert!(parser.is_empty());

      // eating again fails
      assert!(!parser.eat(&jim));
      assert_eq!(parser.remaining(), "");
  }
  /// Scanning empty input yields no ISBN.
  #[test]
  fn test_scan_empty() {
      let defs = ParserDefs::new();
      let mut parser = defs.create_parser("");
      let scan = parser.read_isbn();
      assert_eq!(scan, None);
  }
  /// Empty input is classified as ignorable rather than unmatched.
  #[test]
  fn test_parse_empty() {
      let defs = ParserDefs::new();
      let expected = ParseResult::Ignored(String::new());
      assert_eq!(defs.parse(""), expected);
  }
  /// Scanning whitespace-only input yields no ISBN.
  #[test]
  fn test_scan_ws() {
      let defs = ParserDefs::new();
      let mut parser = defs.create_parser(" ");
      let scan = parser.read_isbn();
      assert_eq!(scan, None);
  }
  /// Whitespace-only input is classified as ignorable, carrying the original text.
  #[test]
  fn test_parse_ws() {
      let defs = ParserDefs::new();
      let expected = ParseResult::Ignored(" ".to_owned());
      assert_eq!(defs.parse(" "), expected);
  }
  /// A bare ISBN is read whole, with no tags and nothing left over.
  #[test]
  fn test_parse_isbn() {
      let isbn = "349224010X";
      let expected = ISBN {
          text: isbn.to_owned(),
          tags: Vec::new(),
      };
      let defs = ParserDefs::new();

      // Low-level scan: one ISBN, consuming the entire input.
      let mut parser = defs.create_parser(isbn);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&expected));
      assert_eq!(parser.position, isbn.len());
      assert!(parser.is_empty());

      // Full parse: same single ISBN, empty trailer.
      match defs.parse(isbn) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// Trailing whitespace and a separator after the ISBN are skipped, leaving no trailer.
  #[test]
  fn test_parse_isbn_trail() {
      let src = "349224010X :";
      let expected = ISBN {
          text: "349224010X".to_owned(),
          tags: Vec::new(),
      };
      let defs = ParserDefs::new();

      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// A combining mark inside the digits is tolerated and stripped from the ISBN text.
  #[test]
  fn test_scan_caron() {
      // The source contains a combining caron (U+030C) after the third digit;
      // the scanned text should be the cleaned ISBN without it.
      let src = "349̌224010X";
      let cleaned = "349224010X";

      let defs = ParserDefs::new();
      let mut parser = defs.create_parser(src);
      let scanned = parser.read_isbn().unwrap();

      assert_eq!(scanned.text, cleaned);
  }
  /// Full parse of an ISBN containing a combining mark yields the cleaned ISBN only.
  #[test]
  fn test_parse_isbn_caron() {
      let src = "349̌224010X";
      let expected = ISBN {
          text: "349224010X".to_owned(),
          tags: Vec::new(),
      };
      let defs = ParserDefs::new();

      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// Hyphens inside an ISBN are accepted and removed from the resulting text.
  #[test]
  fn test_parse_hyphen_isbn() {
      let src = "978-03-2948-9391";
      let expected = ISBN {
          text: "9780329489391".to_owned(),
          tags: Vec::new(),
      };
      let defs = ParserDefs::new();

      // Low-level scan consumes the whole hyphenated form.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&expected));
      assert!(parser.is_empty());

      // Full parse agrees.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// A space after the leading digit group is treated as part of the ISBN and removed.
  #[test]
  fn test_parse_space_isbn() {
      let src = "978 032948-9391";
      let expected = ISBN {
          text: "9780329489391".to_owned(),
          tags: Vec::new(),
      };
      let defs = ParserDefs::new();

      // Low-level scan consumes the whole spaced form.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&expected));
      assert!(parser.is_empty());

      // Full parse agrees.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// A parenthesized tag after the ISBN is captured as a single tag.
  #[test]
  fn test_parse_isbn_tag() {
      let src = "34922401038 (set)";
      let expected = ISBN {
          text: "34922401038".to_owned(),
          tags: vec!["set".to_owned()],
      };
      let defs = ParserDefs::new();

      // Low-level scan reads the ISBN and its tag, consuming everything.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&expected));
      assert!(parser.is_empty());

      // Full parse agrees.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// A square-bracketed tag is captured just like a parenthesized one.
  #[test]
  fn test_parse_isbn_square_tag() {
      let src = "34922401038 [set]";
      let expected = ISBN {
          text: "34922401038".to_owned(),
          tags: vec!["set".to_owned()],
      };
      let defs = ParserDefs::new();

      // Low-level scan reads the ISBN and its tag, consuming everything.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&expected));
      assert!(parser.is_empty());

      // Full parse agrees.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// A colon inside one tag group splits it into separate tags.
  #[test]
  fn test_parse_isbn_multi_tag_sep() {
      let src = "34922401038 (set : alk. paper)";
      let expected = ISBN {
          text: "34922401038".to_owned(),
          tags: vec!["set".to_owned(), "alk. paper".to_owned()],
      };
      let defs = ParserDefs::new();

      // Low-level scan reads the ISBN and both tags, consuming everything.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&expected));
      assert!(parser.is_empty());

      // Full parse agrees.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// Several consecutive tag groups are all collected, in order.
  #[test]
  fn test_parse_isbn_tags() {
      let src = "34922401038 (pbk.) (set)";
      let expected = ISBN {
          text: "34922401038".to_owned(),
          tags: vec!["pbk.".to_owned(), "set".to_owned()],
      };
      let defs = ParserDefs::new();

      // Low-level scan reads the ISBN and both tag groups, consuming everything.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&expected));
      assert!(parser.is_empty());

      // Full parse agrees.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// A single-letter leader before the ISBN is skipped.
  #[test]
  fn test_parse_isbn_leader() {
      let src = "a 970238408138";
      let expected = ISBN {
          text: "970238408138".to_owned(),
          tags: Vec::new(),
      };
      let defs = ParserDefs::new();

      // Low-level scan skips the leader and consumes everything.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&expected));
      assert!(parser.is_empty());

      // Full parse agrees.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![expected]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// Two ISBNs separated only by whitespace are both read.
  #[test]
  fn test_parse_two_isbns_ws() {
      let src = "970238408138 30148100103";
      let first = ISBN {
          text: "970238408138".to_owned(),
          tags: Vec::new(),
      };
      let second = ISBN {
          text: "30148100103".to_owned(),
          tags: Vec::new(),
      };
      let defs = ParserDefs::new();

      // A single scan stops right after the first ISBN.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&first));
      assert_eq!(parser.position, first.text.len());

      // Full parse finds both, in order.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![first, second]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// Two ISBNs separated by a semicolon and an `ISBN` leader are both read.
  #[test]
  fn test_parse_two_isbns_semi() {
      let src = "970238408138; ISBN 30148100103";
      let first = ISBN {
          text: "970238408138".to_owned(),
          tags: Vec::new(),
      };
      let second = ISBN {
          text: "30148100103".to_owned(),
          tags: Vec::new(),
      };
      let defs = ParserDefs::new();

      // A single scan stops right after the first ISBN, before the semicolon.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&first));
      assert_eq!(parser.position, first.text.len());

      // Full parse finds both, in order.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![first, second]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
  /// Two ISBNs separated by a period and an `ISBN` leader; the second carries a tag.
  #[test]
  fn test_parse_two_isbns_real() {
      // Real example from record 2175696
      let src = "8719359022. ISBN 8719359004 (pbk.)";
      let first = ISBN {
          text: "8719359022".to_owned(),
          tags: Vec::new(),
      };
      let second = ISBN {
          text: "8719359004".to_owned(),
          tags: vec!["pbk.".to_owned()],
      };
      let defs = ParserDefs::new();

      // A single scan yields the first, untagged ISBN.
      let mut parser = defs.create_parser(src);
      let scan = parser.read_isbn();
      assert_eq!(scan.as_ref(), Some(&first));

      // Full parse finds both, with the tag attached to the second.
      match defs.parse(src) {
          ParseResult::Valid(found, trail) => {
              assert_eq!(found, vec![first, second]);
              assert_eq!(trail, "");
          }
          other => panic!("bad parse: {:?}", other),
      }
  }
Tip!

Press p or to see the previous file or, n or to see the next file