unittest: Catch missing eng.traineddata in baseapi_test

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Browse Source
Stefan Weil 2 years ago
parent
commit
ceabab8373
1 changed file with 66 additions and 31 deletions
  1. 66
    31
      unittest/baseapi_test.cc

+ 66
- 31
unittest/baseapi_test.cc

@@ -76,47 +76,59 @@ TEST_F(TesseractTest, BasicTesseractTest) {
   tesseract::TessBaseAPI api;
   std::string truth_text;
   std::string ocr_text;
-  api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
-  Pix* src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str());
-  CHECK(src_pix);
-  ocr_text = GetCleanedTextResult(&api, src_pix);
-  CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"),
-                             &truth_text, file::Defaults()));
-  absl::StripAsciiWhitespace(&truth_text);
-  EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
-  pixDestroy(&src_pix);
+  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
+    Pix* src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str());
+    CHECK(src_pix);
+    ocr_text = GetCleanedTextResult(&api, src_pix);
+    CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"),
+                               &truth_text, file::Defaults()));
+    absl::StripAsciiWhitespace(&truth_text);
+    EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
+    pixDestroy(&src_pix);
+  } else {
+    // eng.traineddata not found.
+    GTEST_SKIP();
+  }
 }
 
 // Test that api.GetComponentImages() will return a set of images for
 // paragraphs even if text recognition was not run.
 TEST_F(TesseractTest, IteratesParagraphsEvenIfNotDetected) {
   tesseract::TessBaseAPI api;
-  api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
-  api.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
-  api.SetVariable("paragraph_debug_level", "3");
+  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
+    api.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
+    api.SetVariable("paragraph_debug_level", "3");
 #if 0 // TODO: b622.png is missing
-  Pix* src_pix = pixRead(TestDataNameToPath("b622.png").c_str());
-  CHECK(src_pix);
-  api.SetImage(src_pix);
-  Boxa* para_boxes =
-      api.GetComponentImages(tesseract::RIL_PARA, true, nullptr, nullptr);
-  EXPECT_TRUE(para_boxes != nullptr);
-  Boxa* block_boxes =
-      api.GetComponentImages(tesseract::RIL_BLOCK, true, nullptr, nullptr);
-  EXPECT_TRUE(block_boxes != nullptr);
-  // TODO(eger): Get paragraphs out of this page pre-text.
-  EXPECT_GE(boxaGetCount(para_boxes), boxaGetCount(block_boxes));
-  boxaDestroy(&block_boxes);
-  boxaDestroy(&para_boxes);
-  pixDestroy(&src_pix);
+    Pix* src_pix = pixRead(TestDataNameToPath("b622.png").c_str());
+    CHECK(src_pix);
+    api.SetImage(src_pix);
+    Boxa* para_boxes =
+        api.GetComponentImages(tesseract::RIL_PARA, true, nullptr, nullptr);
+    EXPECT_TRUE(para_boxes != nullptr);
+    Boxa* block_boxes =
+        api.GetComponentImages(tesseract::RIL_BLOCK, true, nullptr, nullptr);
+    EXPECT_TRUE(block_boxes != nullptr);
+    // TODO(eger): Get paragraphs out of this page pre-text.
+    EXPECT_GE(boxaGetCount(para_boxes), boxaGetCount(block_boxes));
+    boxaDestroy(&block_boxes);
+    boxaDestroy(&para_boxes);
+    pixDestroy(&src_pix);
 #endif
+  } else {
+    // eng.traineddata not found.
+    GTEST_SKIP();
+  }
 }
 
 // We should get hOCR output and not seg fault, even if the api caller doesn't
 // call SetInputName().
 TEST_F(TesseractTest, HOCRWorksWithoutSetInputName) {
   tesseract::TessBaseAPI api;
-  api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
+  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
+    // eng.traineddata not found.
+    GTEST_SKIP();
+    return;
+  }
   Pix* src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str());
   CHECK(src_pix);
   api.SetImage(src_pix);
@@ -131,7 +143,11 @@ TEST_F(TesseractTest, HOCRWorksWithoutSetInputName) {
 // hOCR output should contain baseline info for upright textlines.
 TEST_F(TesseractTest, HOCRContainsBaseline) {
   tesseract::TessBaseAPI api;
-  api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
+  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
+    // eng.traineddata not found.
+    GTEST_SKIP();
+    return;
+  }
   Pix* src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str());
   CHECK(src_pix);
   api.SetInputName("HelloGoogle.tif");
@@ -151,6 +167,11 @@ TEST_F(TesseractTest, HOCRContainsBaseline) {
 // better algorithms to deal with baseline and xheight consistency.
 TEST_F(TesseractTest, RickSnyderNotFuckSnyder) {
   tesseract::TessBaseAPI api;
+  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
+    // eng.traineddata not found.
+    GTEST_SKIP();
+    return;
+  }
   api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
 #if 0 // TODO: rick_snyder.jpeg is missing
   Pix* src_pix = pixRead(TestDataNameToPath("rick_snyder.jpeg").c_str());
@@ -161,6 +182,8 @@ TEST_F(TesseractTest, RickSnyderNotFuckSnyder) {
   EXPECT_THAT(result, Not(HasSubstr("FUCK")));
   delete[] result;
   pixDestroy(&src_pix);
+#else
+  GTEST_SKIP();
 #endif
 }
 
@@ -182,7 +205,11 @@ TEST_F(TesseractTest, AdaptToWordStrTest) {
   tesseract::TessBaseAPI api;
   std::string truth_text;
   std::string ocr_text;
-  api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
+  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
+    // eng.traineddata not found.
+    GTEST_SKIP();
+    return;
+  }
   api.SetVariable("matcher_sufficient_examples_for_prototyping", "1");
   api.SetVariable("classify_class_pruner_threshold", "220");
   // Train on the training text.
@@ -216,7 +243,11 @@ TEST_F(TesseractTest, BasicLSTMTest) {
   tesseract::TessBaseAPI api;
   std::string truth_text;
   std::string ocr_text;
-  api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY);
+  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) {
+    // eng.traineddata not found.
+    GTEST_SKIP();
+    return;
+  }
   Pix* src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str());
   CHECK(src_pix);
   ocr_text = GetCleanedTextResult(&api, src_pix);
@@ -240,7 +271,11 @@ TEST_F(TesseractTest, LSTMGeometryTest) {
 #else
   Pix* src_pix = pixRead(TestDataNameToPath("deslant.tif").c_str());
   FriendlyTessBaseAPI api;
-  api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY);
+  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) {
+    // eng.traineddata not found.
+    GTEST_SKIP();
+    return;
+  }
   api.SetImage(src_pix);
   ASSERT_EQ(api.Recognize(nullptr), 0);