baseapi.cpp 92 KB


  1. /**********************************************************************
  2. * File: baseapi.cpp
  3. * Description: Simple API for calling tesseract.
  4. * Author: Ray Smith
  5. *
  6. * (C) Copyright 2006, Google Inc.
  7. ** Licensed under the Apache License, Version 2.0 (the "License");
  8. ** you may not use this file except in compliance with the License.
  9. ** You may obtain a copy of the License at
  10. ** http://www.apache.org/licenses/LICENSE-2.0
  11. ** Unless required by applicable law or agreed to in writing, software
  12. ** distributed under the License is distributed on an "AS IS" BASIS,
  13. ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. ** See the License for the specific language governing permissions and
  15. ** limitations under the License.
  16. *
  17. **********************************************************************/
  18. #define _USE_MATH_DEFINES // for M_PI
  19. // Include automatically generated configuration file if running autoconf.
  20. #ifdef HAVE_CONFIG_H
  21. #include "config_auto.h"
  22. #endif
  23. #include <tesseract/baseapi.h>
  24. #ifdef __linux__
  25. #include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE
  26. #endif
  27. #if defined(_WIN32)
  28. #include <fcntl.h>
  29. #include <io.h>
  30. #else
  31. #include <dirent.h> // for closedir, opendir, readdir, DIR, dirent
  32. #include <libgen.h>
  33. #include <sys/types.h>
  34. #include <sys/stat.h> // for stat, S_IFDIR
  35. #include <unistd.h>
  36. #endif // _WIN32
  37. #include <cmath> // for round, M_PI
  38. #include <cstdint> // for int32_t
  39. #include <cstring> // for strcmp, strcpy
  40. #include <fstream> // for size_t
  41. #include <iostream> // for std::cin
  42. #include <locale> // for std::locale::classic
  43. #include <memory> // for std::unique_ptr
  44. #include <set> // for std::pair
  45. #include <sstream> // for std::stringstream
  46. #include <vector> // for std::vector
  47. #ifdef HAVE_LIBCURL
  48. #include <curl/curl.h>
  49. #endif
  50. #include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box...
  51. #ifndef DISABLED_LEGACY_ENGINE
  52. #include "blobclass.h" // for ExtractFontName
  53. #endif
  54. #include "boxword.h" // for BoxWord
  55. #include "config_auto.h" // for PACKAGE_VERSION
  56. #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST
  57. #include "dawg_cache.h" // for DawgCache
  58. #include "dict.h" // for Dict
  59. #include "edgblob.h" // for extract_edges
  60. #include "elst.h" // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH
  61. #include "environ.h" // for l_uint8
  62. #include "equationdetect.h" // for EquationDetect
  63. #include "errcode.h" // for ASSERT_HOST
  64. #include <tesseract/helpers.h> // for IntCastRounded, chomp_string
  65. #include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...
  66. #ifndef DISABLED_LEGACY_ENGINE
  67. #include "intfx.h" // for INT_FX_RESULT_STRUCT
  68. #endif
  69. #include "mutableiterator.h" // for MutableIterator
  70. #include "normalis.h" // for kBlnBaselineOffset, kBlnXHeight
  71. #include <tesseract/ocrclass.h> // for ETEXT_DESC
  72. #if defined(USE_OPENCL)
  73. #include "openclwrapper.h" // for OpenclDevice
  74. #endif
  75. #include <tesseract/osdetect.h> // for OSResults, OSBestResult, OrientationId...
  76. #include "pageres.h" // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...
  77. #include "paragraphs.h" // for DetectParagraphs
  78. #include "params.h" // for BoolParam, IntParam, DoubleParam, Stri...
  79. #include "pdblock.h" // for PDBLK
  80. #include "points.h" // for FCOORD
  81. #include "polyblk.h" // for POLY_BLOCK
  82. #include "rect.h" // for TBOX
  83. #include <tesseract/renderer.h> // for TessResultRenderer
  84. #include <tesseract/resultiterator.h> // for ResultIterator
  85. #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
  86. #include <tesseract/strngs.h> // for STRING
  87. #include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix
  88. #include "tesseractclass.h" // for Tesseract
  89. #include <tesseract/thresholder.h> // for ImageThresholder
  90. #include "tprintf.h" // for tprintf
  91. #include "werd.h" // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP
  92. static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
  93. static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
  94. namespace tesseract {
  /** Smallest rectangle side for which running tesseract is worthwhile. */
  constexpr int kMinRectSize = 10;

  /** Character emitted when Tesseract could not recognize anything. */
  constexpr char kTesseractReject = '~';

  /** Reject character used by the UNLV error counter. */
  constexpr char kUNLVReject = '~';

  /** Suspect marker used by UNLV. */
  constexpr char kUNLVSuspect = '^';

  /**
   * Input image filename from which the name of a possible UNLV zone file is
   * derived when SetInputName has not supplied one.
   */
  static const char* kInputFile = "noname.tif";

  /**
   * Temp file that holds the current parameters before retry values are
   * applied.
   */
  static const char* kOldVarsFile = "failed_vars.txt";

  /** Maximum string length of an int. */
  constexpr int kMaxIntSize = 22;
  114. /* Add all available languages recursively.
  115. */
  116. static void addAvailableLanguages(const STRING &datadir, const STRING &base,
  117. GenericVector<STRING>* langs)
  118. {
  119. const STRING base2 = (base.c_str()[0] == '\0') ? base : base + "/";
  120. const size_t extlen = sizeof(kTrainedDataSuffix);
  121. #ifdef _WIN32
  122. WIN32_FIND_DATA data;
  123. HANDLE handle = FindFirstFile((datadir + base2 + "*").c_str(), &data);
  124. if (handle != INVALID_HANDLE_VALUE) {
  125. BOOL result = TRUE;
  126. for (; result;) {
  127. char *name = data.cFileName;
  128. // Skip '.', '..', and hidden files
  129. if (name[0] != '.') {
  130. if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ==
  131. FILE_ATTRIBUTE_DIRECTORY) {
  132. addAvailableLanguages(datadir, base2 + name, langs);
  133. } else {
  134. size_t len = strlen(name);
  135. if (len > extlen && name[len - extlen] == '.' &&
  136. strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
  137. name[len - extlen] = '\0';
  138. langs->push_back(base2 + name);
  139. }
  140. }
  141. }
  142. result = FindNextFile(handle, &data);
  143. }
  144. FindClose(handle);
  145. }
  146. #else // _WIN32
  147. DIR* dir = opendir((datadir + base).c_str());
  148. if (dir != nullptr) {
  149. dirent *de;
  150. while ((de = readdir(dir))) {
  151. char *name = de->d_name;
  152. // Skip '.', '..', and hidden files
  153. if (name[0] != '.') {
  154. struct stat st;
  155. if (stat((datadir + base2 + name).c_str(), &st) == 0 &&
  156. (st.st_mode & S_IFDIR) == S_IFDIR) {
  157. addAvailableLanguages(datadir, base2 + name, langs);
  158. } else {
  159. size_t len = strlen(name);
  160. if (len > extlen && name[len - extlen] == '.' &&
  161. strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
  162. name[len - extlen] = '\0';
  163. langs->push_back(base2 + name);
  164. }
  165. }
  166. }
  167. }
  168. closedir(dir);
  169. }
  170. #endif
  171. }
  172. // Compare two STRING values (used for sorting).
  173. static int CompareSTRING(const void* p1, const void* p2) {
  174. const auto* s1 = static_cast<const STRING*>(p1);
  175. const auto* s2 = static_cast<const STRING*>(p2);
  176. return strcmp(s1->c_str(), s2->c_str());
  177. }
  178. TessBaseAPI::TessBaseAPI()
  179. : tesseract_(nullptr),
  180. osd_tesseract_(nullptr),
  181. equ_detect_(nullptr),
  182. reader_(nullptr),
  183. // Thresholder is initialized to nullptr here, but will be set before use by:
  184. // A constructor of a derived API, SetThresholder(), or
  185. // created implicitly when used in InternalSetImage.
  186. thresholder_(nullptr),
  187. paragraph_models_(nullptr),
  188. block_list_(nullptr),
  189. page_res_(nullptr),
  190. input_file_(nullptr),
  191. output_file_(nullptr),
  192. datapath_(nullptr),
  193. language_(nullptr),
  194. last_oem_requested_(OEM_DEFAULT),
  195. recognition_done_(false),
  196. truth_cb_(nullptr),
  197. rect_left_(0),
  198. rect_top_(0),
  199. rect_width_(0),
  200. rect_height_(0),
  201. image_width_(0),
  202. image_height_(0) {
  203. #if defined(DEBUG)
  204. // The Tesseract executables would use the "C" locale by default,
  205. // but other software which is linked against the Tesseract library
  206. // typically uses the locale from the user's environment.
  207. // Here the default is overridden to allow debugging of potential
  208. // problems caused by the locale settings.
  209. // Use the current locale if building debug code.
  210. try {
  211. std::locale::global(std::locale(""));
  212. } catch (const std::runtime_error& ex) {
  213. fprintf(stderr, "Warning: Could not set the current locale\n");
  214. }
  215. #endif
  216. }
  217. TessBaseAPI::~TessBaseAPI() {
  218. End();
  219. }
  220. /**
  221. * Returns the version identifier as a static string. Do not delete.
  222. */
  223. const char* TessBaseAPI::Version() {
  224. return PACKAGE_VERSION;
  225. }
  226. /**
  227. * If compiled with OpenCL AND an available OpenCL
  228. * device is deemed faster than serial code, then
  229. * "device" is populated with the cl_device_id
  230. * and returns sizeof(cl_device_id)
  231. * otherwise *device=nullptr and returns 0.
  232. */
  233. size_t TessBaseAPI::getOpenCLDevice(void **data) {
  234. #ifdef USE_OPENCL
  235. ds_device device = OpenclDevice::getDeviceSelection();
  236. if (device.type == DS_DEVICE_OPENCL_DEVICE) {
  237. *data = new cl_device_id;
  238. memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
  239. return sizeof(cl_device_id);
  240. }
  241. #endif
  242. *data = nullptr;
  243. return 0;
  244. }
  245. /**
  246. * Set the name of the input file. Needed only for training and
  247. * loading a UNLV zone file.
  248. */
  249. void TessBaseAPI::SetInputName(const char* name) {
  250. if (input_file_ == nullptr)
  251. input_file_ = new STRING(name);
  252. else
  253. *input_file_ = name;
  254. }
  255. /** Set the name of the output files. Needed only for debugging. */
  256. void TessBaseAPI::SetOutputName(const char* name) {
  257. if (output_file_ == nullptr)
  258. output_file_ = new STRING(name);
  259. else
  260. *output_file_ = name;
  261. }
  262. bool TessBaseAPI::SetVariable(const char* name, const char* value) {
  263. if (tesseract_ == nullptr) tesseract_ = new Tesseract;
  264. return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
  265. tesseract_->params());
  266. }
  267. bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) {
  268. if (tesseract_ == nullptr) tesseract_ = new Tesseract;
  269. return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY,
  270. tesseract_->params());
  271. }
  272. bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
  273. auto *p = ParamUtils::FindParam<IntParam>(
  274. name, GlobalParams()->int_params, tesseract_->params()->int_params);
  275. if (p == nullptr) return false;
  276. *value = (int32_t)(*p);
  277. return true;
  278. }
  279. bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
  280. auto *p = ParamUtils::FindParam<BoolParam>(
  281. name, GlobalParams()->bool_params, tesseract_->params()->bool_params);
  282. if (p == nullptr) return false;
  283. *value = bool(*p);
  284. return true;
  285. }
  286. const char *TessBaseAPI::GetStringVariable(const char *name) const {
  287. auto *p = ParamUtils::FindParam<StringParam>(
  288. name, GlobalParams()->string_params, tesseract_->params()->string_params);
  289. return (p != nullptr) ? p->c_str() : nullptr;
  290. }
  291. bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
  292. auto *p = ParamUtils::FindParam<DoubleParam>(
  293. name, GlobalParams()->double_params, tesseract_->params()->double_params);
  294. if (p == nullptr) return false;
  295. *value = (double)(*p);
  296. return true;
  297. }
  298. /** Get value of named variable as a string, if it exists. */
  299. bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) {
  300. return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
  301. }
  302. /** Print Tesseract parameters to the given file. */
  303. void TessBaseAPI::PrintVariables(FILE *fp) const {
  304. ParamUtils::PrintParams(fp, tesseract_->params());
  305. }
  306. /**
  307. * The datapath must be the name of the data directory or
  308. * some other file in which the data directory resides (for instance argv[0].)
  309. * The language is (usually) an ISO 639-3 string or nullptr will default to eng.
  310. * If numeric_mode is true, then only digits and Roman numerals will
  311. * be returned.
  312. * @return: 0 on success and -1 on initialization failure.
  313. */
  314. int TessBaseAPI::Init(const char* datapath, const char* language,
  315. OcrEngineMode oem, char **configs, int configs_size,
  316. const GenericVector<STRING> *vars_vec,
  317. const GenericVector<STRING> *vars_values,
  318. bool set_only_non_debug_params) {
  319. return Init(datapath, 0, language, oem, configs, configs_size, vars_vec,
  320. vars_values, set_only_non_debug_params, nullptr);
  321. }
  322. // In-memory version reads the traineddata file directly from the given
  323. // data[data_size] array. Also implements the version with a datapath in data,
  324. // flagged by data_size = 0.
  325. int TessBaseAPI::Init(const char* data, int data_size, const char* language,
  326. OcrEngineMode oem, char** configs, int configs_size,
  327. const GenericVector<STRING>* vars_vec,
  328. const GenericVector<STRING>* vars_values,
  329. bool set_only_non_debug_params, FileReader reader) {
  330. // Default language is "eng".
  331. if (language == nullptr) language = "eng";
  332. STRING datapath = data_size == 0 ? data : language;
  333. // If the datapath, OcrEngineMode or the language have changed - start again.
  334. // Note that the language_ field stores the last requested language that was
  335. // initialized successfully, while tesseract_->lang stores the language
  336. // actually used. They differ only if the requested language was nullptr, in
  337. // which case tesseract_->lang is set to the Tesseract default ("eng").
  338. if (tesseract_ != nullptr &&
  339. (datapath_ == nullptr || language_ == nullptr || *datapath_ != datapath ||
  340. last_oem_requested_ != oem ||
  341. (*language_ != language && tesseract_->lang != language))) {
  342. delete tesseract_;
  343. tesseract_ = nullptr;
  344. }
  345. #ifdef USE_OPENCL
  346. OpenclDevice od;
  347. od.InitEnv();
  348. #endif
  349. bool reset_classifier = true;
  350. if (tesseract_ == nullptr) {
  351. reset_classifier = false;
  352. tesseract_ = new Tesseract;
  353. if (reader != nullptr) reader_ = reader;
  354. TessdataManager mgr(reader_);
  355. if (data_size != 0) {
  356. mgr.LoadMemBuffer(language, data, data_size);
  357. }
  358. if (tesseract_->init_tesseract(
  359. datapath.c_str(),
  360. output_file_ != nullptr ? output_file_->c_str() : nullptr,
  361. language, oem, configs, configs_size, vars_vec, vars_values,
  362. set_only_non_debug_params, &mgr) != 0) {
  363. return -1;
  364. }
  365. }
  366. // Update datapath and language requested for the last valid initialization.
  367. if (datapath_ == nullptr)
  368. datapath_ = new STRING(datapath);
  369. else
  370. *datapath_ = datapath;
  371. if ((strcmp(datapath_->c_str(), "") == 0) &&
  372. (strcmp(tesseract_->datadir.c_str(), "") != 0))
  373. *datapath_ = tesseract_->datadir;
  374. if (language_ == nullptr)
  375. language_ = new STRING(language);
  376. else
  377. *language_ = language;
  378. last_oem_requested_ = oem;
  379. #ifndef DISABLED_LEGACY_ENGINE
  380. // For same language and datapath, just reset the adaptive classifier.
  381. if (reset_classifier) {
  382. tesseract_->ResetAdaptiveClassifier();
  383. }
  384. #endif // ndef DISABLED_LEGACY_ENGINE
  385. return 0;
  386. }
  387. /**
  388. * Returns the languages string used in the last valid initialization.
  389. * If the last initialization specified "deu+hin" then that will be
  390. * returned. If hin loaded eng automatically as well, then that will
  391. * not be included in this list. To find the languages actually
  392. * loaded use GetLoadedLanguagesAsVector.
  393. * The returned string should NOT be deleted.
  394. */
  395. const char* TessBaseAPI::GetInitLanguagesAsString() const {
  396. return (language_ == nullptr || language_->c_str() == nullptr) ?
  397. "" : language_->c_str();
  398. }
  399. /**
  400. * Returns the loaded languages in the vector of STRINGs.
  401. * Includes all languages loaded by the last Init, including those loaded
  402. * as dependencies of other loaded languages.
  403. */
  404. void TessBaseAPI::GetLoadedLanguagesAsVector(
  405. GenericVector<STRING>* langs) const {
  406. langs->clear();
  407. if (tesseract_ != nullptr) {
  408. langs->push_back(tesseract_->lang);
  409. int num_subs = tesseract_->num_sub_langs();
  410. for (int i = 0; i < num_subs; ++i)
  411. langs->push_back(tesseract_->get_sub_lang(i)->lang);
  412. }
  413. }
  414. /**
  415. * Returns the available languages in the sorted vector of STRINGs.
  416. */
  417. void TessBaseAPI::GetAvailableLanguagesAsVector(
  418. GenericVector<STRING>* langs) const {
  419. langs->clear();
  420. if (tesseract_ != nullptr) {
  421. addAvailableLanguages(tesseract_->datadir, "", langs);
  422. langs->sort(CompareSTRING);
  423. }
  424. }
  425. //TODO(amit): Adapt to lstm
  426. #ifndef DISABLED_LEGACY_ENGINE
  427. /**
  428. * Init only the lang model component of Tesseract. The only functions
  429. * that work after this init are SetVariable and IsValidWord.
  430. * WARNING: temporary! This function will be removed from here and placed
  431. * in a separate API at some future time.
  432. */
  433. int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
  434. if (tesseract_ == nullptr)
  435. tesseract_ = new Tesseract;
  436. else
  437. ParamUtils::ResetToDefaults(tesseract_->params());
  438. TessdataManager mgr;
  439. return tesseract_->init_tesseract_lm(datapath, nullptr, language, &mgr);
  440. }
  441. #endif // ndef DISABLED_LEGACY_ENGINE
  442. /**
  443. * Init only for page layout analysis. Use only for calls to SetImage and
  444. * AnalysePage. Calls that attempt recognition will generate an error.
  445. */
  446. void TessBaseAPI::InitForAnalysePage() {
  447. if (tesseract_ == nullptr) {
  448. tesseract_ = new Tesseract;
  449. #ifndef DISABLED_LEGACY_ENGINE
  450. tesseract_->InitAdaptiveClassifier(nullptr);
  451. #endif
  452. }
  453. }
  454. /**
  455. * Read a "config" file containing a set of parameter name, value pairs.
  456. * Searches the standard places: tessdata/configs, tessdata/tessconfigs
  457. * and also accepts a relative or absolute path name.
  458. */
  459. void TessBaseAPI::ReadConfigFile(const char* filename) {
  460. tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);
  461. }
  462. /** Same as above, but only set debug params from the given config file. */
  463. void TessBaseAPI::ReadDebugConfigFile(const char* filename) {
  464. tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);
  465. }
  466. /**
  467. * Set the current page segmentation mode. Defaults to PSM_AUTO.
  468. * The mode is stored as an IntParam so it can also be modified by
  469. * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
  470. */
  471. void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
  472. if (tesseract_ == nullptr)
  473. tesseract_ = new Tesseract;
  474. tesseract_->tessedit_pageseg_mode.set_value(mode);
  475. }
  476. /** Return the current page segmentation mode. */
  477. PageSegMode TessBaseAPI::GetPageSegMode() const {
  478. if (tesseract_ == nullptr)
  479. return PSM_SINGLE_BLOCK;
  480. return static_cast<PageSegMode>(
  481. static_cast<int>(tesseract_->tessedit_pageseg_mode));
  482. }
  483. /**
  484. * Recognize a rectangle from an image and return the result as a string.
  485. * May be called many times for a single Init.
  486. * Currently has no error checking.
  487. * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
  488. * Palette color images will not work properly and must be converted to
  489. * 24 bit.
  490. * Binary images of 1 bit per pixel may also be given but they must be
  491. * byte packed with the MSB of the first byte being the first pixel, and a
  492. * one pixel is WHITE. For binary images set bytes_per_pixel=0.
  493. * The recognized text is returned as a char* which is coded
  494. * as UTF8 and must be freed with the delete [] operator.
  495. */
  496. char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
  497. int bytes_per_pixel,
  498. int bytes_per_line,
  499. int left, int top,
  500. int width, int height) {
  501. if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize)
  502. return nullptr; // Nothing worth doing.
  503. // Since this original api didn't give the exact size of the image,
  504. // we have to invent a reasonable value.
  505. int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
  506. SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top,
  507. bytes_per_pixel, bytes_per_line);
  508. SetRectangle(left, top, width, height);
  509. return GetUTF8Text();
  510. }
  511. #ifndef DISABLED_LEGACY_ENGINE
  512. /**
  513. * Call between pages or documents etc to free up memory and forget
  514. * adaptive data.
  515. */
  516. void TessBaseAPI::ClearAdaptiveClassifier() {
  517. if (tesseract_ == nullptr)
  518. return;
  519. tesseract_->ResetAdaptiveClassifier();
  520. tesseract_->ResetDocumentDictionary();
  521. }
  522. #endif // ndef DISABLED_LEGACY_ENGINE
  523. /**
  524. * Provide an image for Tesseract to recognize. Format is as
  525. * TesseractRect above. Copies the image buffer and converts to Pix.
  526. * SetImage clears all recognition results, and sets the rectangle to the
  527. * full image, so it may be followed immediately by a GetUTF8Text, and it
  528. * will automatically perform recognition.
  529. */
  530. void TessBaseAPI::SetImage(const unsigned char* imagedata,
  531. int width, int height,
  532. int bytes_per_pixel, int bytes_per_line) {
  533. if (InternalSetImage()) {
  534. thresholder_->SetImage(imagedata, width, height,
  535. bytes_per_pixel, bytes_per_line);
  536. SetInputImage(thresholder_->GetPixRect());
  537. }
  538. }
  539. void TessBaseAPI::SetSourceResolution(int ppi) {
  540. if (thresholder_)
  541. thresholder_->SetSourceYResolution(ppi);
  542. else
  543. tprintf("Please call SetImage before SetSourceResolution.\n");
  544. }
  545. /**
  546. * Provide an image for Tesseract to recognize. As with SetImage above,
  547. * Tesseract takes its own copy of the image, so it need not persist until
  548. * after Recognize.
  549. * Pix vs raw, which to use?
  550. * Use Pix where possible. Tesseract uses Pix as its internal representation
  551. * and it is therefore more efficient to provide a Pix directly.
  552. */
  553. void TessBaseAPI::SetImage(Pix* pix) {
  554. if (InternalSetImage()) {
  555. if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) {
  556. // remove alpha channel from png
  557. Pix* p1 = pixRemoveAlpha(pix);
  558. pixSetSpp(p1, 3);
  559. (void)pixCopy(pix, p1);
  560. pixDestroy(&p1);
  561. }
  562. thresholder_->SetImage(pix);
  563. SetInputImage(thresholder_->GetPixRect());
  564. }
  565. }
  566. /**
  567. * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
  568. * Each SetRectangle clears the recogntion results so multiple rectangles
  569. * can be recognized with the same image.
  570. */
  571. void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
  572. if (thresholder_ == nullptr)
  573. return;
  574. thresholder_->SetRectangle(left, top, width, height);
  575. ClearResults();
  576. }
  577. /**
  578. * ONLY available after SetImage if you have Leptonica installed.
  579. * Get a copy of the internal thresholded image from Tesseract.
  580. */
  581. Pix* TessBaseAPI::GetThresholdedImage() {
  582. if (tesseract_ == nullptr || thresholder_ == nullptr) return nullptr;
  583. if (tesseract_->pix_binary() == nullptr &&
  584. !Threshold(tesseract_->mutable_pix_binary())) {
  585. return nullptr;
  586. }
  587. return pixClone(tesseract_->pix_binary());
  588. }
  589. /**
  590. * Get the result of page layout analysis as a leptonica-style
  591. * Boxa, Pixa pair, in reading order.
  592. * Can be called before or after Recognize.
  593. */
  594. Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
  595. return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);
  596. }
  597. /**
  598. * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order.
  599. * Can be called before or after Recognize.
  600. * If blockids is not nullptr, the block-id of each line is also returned as an
  601. * array of one element per line. delete [] after use.
  602. * If paraids is not nullptr, the paragraph-id of each line within its block is
  603. * also returned as an array of one element per line. delete [] after use.
  604. */
  605. Boxa* TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding,
  606. Pixa** pixa, int** blockids, int** paraids) {
  607. return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding,
  608. pixa, blockids, paraids);
  609. }
  610. /**
  611. * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
  612. * pair, in reading order. Enables downstream handling of non-rectangular
  613. * regions.
  614. * Can be called before or after Recognize.
  615. * If blockids is not nullptr, the block-id of each line is also returned as an
  616. * array of one element per line. delete [] after use.
  617. */
  618. Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) {
  619. return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
  620. }
  621. /**
  622. * Get the words as a leptonica-style
  623. * Boxa, Pixa pair, in reading order.
  624. * Can be called before or after Recognize.
  625. */
  626. Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
  627. return GetComponentImages(RIL_WORD, true, pixa, nullptr);
  628. }
  629. /**
  630. * Gets the individual connected (text) components (created
  631. * after pages segmentation step, but before recognition)
  632. * as a leptonica-style Boxa, Pixa pair, in reading order.
  633. * Can be called before or after Recognize.
  634. */
  635. Boxa* TessBaseAPI::GetConnectedComponents(Pixa** pixa) {
  636. return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);
  637. }
  638. /**
  639. * Get the given level kind of components (block, textline, word etc.) as a
  640. * leptonica-style Boxa, Pixa pair, in reading order.
  641. * Can be called before or after Recognize.
  642. * If blockids is not nullptr, the block-id of each component is also returned
  643. * as an array of one element per component. delete [] after use.
  644. * If text_only is true, then only text components are returned.
  645. */
  646. Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level,
  647. bool text_only, bool raw_image,
  648. const int raw_padding,
  649. Pixa** pixa, int** blockids,
  650. int** paraids) {
  651. PageIterator* page_it = GetIterator();
  652. if (page_it == nullptr)
  653. page_it = AnalyseLayout();
  654. if (page_it == nullptr)
  655. return nullptr; // Failed.
  656. // Count the components to get a size for the arrays.
  657. int component_count = 0;
  658. int left, top, right, bottom;
  659. if (raw_image) {
  660. // Get bounding box in original raw image with padding.
  661. do {
  662. if (page_it->BoundingBox(level, raw_padding,
  663. &left, &top, &right, &bottom) &&
  664. (!text_only || PTIsTextType(page_it->BlockType())))
  665. ++component_count;
  666. } while (page_it->Next(level));
  667. } else {
  668. // Get bounding box from binarized imaged. Note that this could be
  669. // differently scaled from the original image.
  670. do {
  671. if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
  672. (!text_only || PTIsTextType(page_it->BlockType())))
  673. ++component_count;
  674. } while (page_it->Next(level));
  675. }
  676. Boxa* boxa = boxaCreate(component_count);
  677. if (pixa != nullptr)
  678. *pixa = pixaCreate(component_count);
  679. if (blockids != nullptr)
  680. *blockids = new int[component_count];
  681. if (paraids != nullptr)
  682. *paraids = new int[component_count];
  683. int blockid = 0;
  684. int paraid = 0;
  685. int component_index = 0;
  686. page_it->Begin();
  687. do {
  688. bool got_bounding_box;
  689. if (raw_image) {
  690. got_bounding_box =
  691. page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom);
  692. } else {
  693. got_bounding_box =
  694. page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom);
  695. }
  696. if (got_bounding_box &&
  697. (!text_only || PTIsTextType(page_it->BlockType()))) {
  698. Box* lbox = boxCreate(left, top, right - left, bottom - top);
  699. boxaAddBox(boxa, lbox, L_INSERT);
  700. if (pixa != nullptr) {
  701. Pix* pix = nullptr;
  702. if (raw_image) {
  703. pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left,
  704. &top);
  705. } else {
  706. pix = page_it->GetBinaryImage(level);
  707. }
  708. pixaAddPix(*pixa, pix, L_INSERT);
  709. pixaAddBox(*pixa, lbox, L_CLONE);
  710. }
  711. if (paraids != nullptr) {
  712. (*paraids)[component_index] = paraid;
  713. if (page_it->IsAtFinalElement(RIL_PARA, level))
  714. ++paraid;
  715. }
  716. if (blockids != nullptr) {
  717. (*blockids)[component_index] = blockid;
  718. if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
  719. ++blockid;
  720. paraid = 0;
  721. }
  722. }
  723. ++component_index;
  724. }
  725. } while (page_it->Next(level));
  726. delete page_it;
  727. return boxa;
  728. }
  729. int TessBaseAPI::GetThresholdedImageScaleFactor() const {
  730. if (thresholder_ == nullptr) {
  731. return 0;
  732. }
  733. return thresholder_->GetScaleFactor();
  734. }
  735. /**
  736. * Runs page layout analysis in the mode set by SetPageSegMode.
  737. * May optionally be called prior to Recognize to get access to just
  738. * the page layout results. Returns an iterator to the results.
  739. * If merge_similar_words is true, words are combined where suitable for use
  740. * with a line recognizer. Use if you want to use AnalyseLayout to find the
  741. * textlines, and then want to process textline fragments with an external
  742. * line recognizer.
  743. * Returns nullptr on error or an empty page.
  744. * The returned iterator must be deleted after use.
  745. * WARNING! This class points to data held within the TessBaseAPI class, and
  746. * therefore can only be used while the TessBaseAPI class still exists and
  747. * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
  748. * DetectOS, or anything else that changes the internal PAGE_RES.
  749. */
  750. PageIterator* TessBaseAPI::AnalyseLayout() { return AnalyseLayout(false); }
  751. PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
  752. if (FindLines() == 0) {
  753. if (block_list_->empty())
  754. return nullptr; // The page was empty.
  755. page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);
  756. DetectParagraphs(false);
  757. return new PageIterator(
  758. page_res_, tesseract_, thresholder_->GetScaleFactor(),
  759. thresholder_->GetScaledYResolution(),
  760. rect_left_, rect_top_, rect_width_, rect_height_);
  761. }
  762. return nullptr;
  763. }
  764. /**
  765. * Recognize the tesseract global image and return the result as Tesseract
  766. * internal structures.
  767. */
  768. int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
  769. if (tesseract_ == nullptr)
  770. return -1;
  771. if (FindLines() != 0)
  772. return -1;
  773. delete page_res_;
  774. if (block_list_->empty()) {
  775. page_res_ = new PAGE_RES(false, block_list_,
  776. &tesseract_->prev_word_best_choice_);
  777. return 0; // Empty page.
  778. }
  779. tesseract_->SetBlackAndWhitelist();
  780. recognition_done_ = true;
  781. #ifndef DISABLED_LEGACY_ENGINE
  782. if (tesseract_->tessedit_resegment_from_line_boxes) {
  783. page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_);
  784. } else if (tesseract_->tessedit_resegment_from_boxes) {
  785. page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_);
  786. } else
  787. #endif // ndef DISABLED_LEGACY_ENGINE
  788. {
  789. page_res_ = new PAGE_RES(tesseract_->AnyLSTMLang(),
  790. block_list_, &tesseract_->prev_word_best_choice_);
  791. }
  792. if (page_res_ == nullptr) {
  793. return -1;
  794. }
  795. if (tesseract_->tessedit_train_line_recognizer) {
  796. if (!tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_)) {
  797. return -1;
  798. }
  799. tesseract_->CorrectClassifyWords(page_res_);
  800. return 0;
  801. }
  802. #ifndef DISABLED_LEGACY_ENGINE
  803. if (tesseract_->tessedit_make_boxes_from_boxes) {
  804. tesseract_->CorrectClassifyWords(page_res_);
  805. return 0;
  806. }
  807. #endif // ndef DISABLED_LEGACY_ENGINE
  808. if (truth_cb_ != nullptr) {
  809. tesseract_->wordrec_run_blamer.set_value(true);
  810. auto *page_it = new PageIterator(
  811. page_res_, tesseract_, thresholder_->GetScaleFactor(),
  812. thresholder_->GetScaledYResolution(),
  813. rect_left_, rect_top_, rect_width_, rect_height_);
  814. truth_cb_(tesseract_->getDict().getUnicharset(),
  815. image_height_, page_it, this->tesseract()->pix_grey());
  816. delete page_it;
  817. }
  818. int result = 0;
  819. if (tesseract_->interactive_display_mode) {
  820. #ifndef GRAPHICS_DISABLED
  821. tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);
  822. #endif // GRAPHICS_DISABLED
  823. // The page_res is invalid after an interactive session, so cleanup
  824. // in a way that lets us continue to the next page without crashing.
  825. delete page_res_;
  826. page_res_ = nullptr;
  827. return -1;
  828. #ifndef DISABLED_LEGACY_ENGINE
  829. } else if (tesseract_->tessedit_train_from_boxes) {
  830. STRING fontname;
  831. ExtractFontName(*output_file_, &fontname);
  832. tesseract_->ApplyBoxTraining(fontname, page_res_);
  833. } else if (tesseract_->tessedit_ambigs_training) {
  834. FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
  835. // OCR the page segmented into words by tesseract.
  836. tesseract_->recog_training_segmented(
  837. *input_file_, page_res_, monitor, training_output_file);
  838. fclose(training_output_file);
  839. #endif // ndef DISABLED_LEGACY_ENGINE
  840. } else {
  841. // Now run the main recognition.
  842. bool wait_for_text = true;
  843. GetBoolVariable("paragraph_text_based", &wait_for_text);
  844. if (!wait_for_text) DetectParagraphs(false);
  845. if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {
  846. if (wait_for_text) DetectParagraphs(true);
  847. } else {
  848. result = -1;
  849. }
  850. }
  851. return result;
  852. }
  853. #ifndef DISABLED_LEGACY_ENGINE
  854. /** Tests the chopper by exhaustively running chop_one_blob. */
  855. int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {
  856. if (tesseract_ == nullptr)
  857. return -1;
  858. if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
  859. tprintf("Please call SetImage before attempting recognition.\n");
  860. return -1;
  861. }
  862. if (page_res_ != nullptr)
  863. ClearResults();
  864. if (FindLines() != 0)
  865. return -1;
  866. // Additional conditions under which chopper test cannot be run
  867. if (tesseract_->interactive_display_mode) return -1;
  868. recognition_done_ = true;
  869. page_res_ = new PAGE_RES(false, block_list_,
  870. &(tesseract_->prev_word_best_choice_));
  871. PAGE_RES_IT page_res_it(page_res_);
  872. while (page_res_it.word() != nullptr) {
  873. WERD_RES *word_res = page_res_it.word();
  874. GenericVector<TBOX> boxes;
  875. tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block,
  876. page_res_it.row()->row, word_res);
  877. page_res_it.forward();
  878. }
  879. return 0;
  880. }
  881. #endif // ndef DISABLED_LEGACY_ENGINE
  882. // Takes ownership of the input pix.
  883. void TessBaseAPI::SetInputImage(Pix* pix) { tesseract_->set_pix_original(pix); }
  884. Pix* TessBaseAPI::GetInputImage() { return tesseract_->pix_original(); }
  885. const char * TessBaseAPI::GetInputName() {
  886. if (input_file_)
  887. return input_file_->c_str();
  888. return nullptr;
  889. }
  890. const char * TessBaseAPI::GetDatapath() {
  891. return tesseract_->datadir.c_str();
  892. }
  893. int TessBaseAPI::GetSourceYResolution() {
  894. return thresholder_->GetSourceYResolution();
  895. }
  896. // If flist exists, get data from there. Otherwise get data from buf.
  897. // Seems convoluted, but is the easiest way I know of to meet multiple
  898. // goals. Support streaming from stdin, and also work on platforms
  899. // lacking fmemopen.
  900. bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
  901. STRING *buf,
  902. const char* retry_config,
  903. int timeout_millisec,
  904. TessResultRenderer* renderer,
  905. int tessedit_page_number) {
  906. if (!flist && !buf) return false;
  907. int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
  908. char pagename[MAX_PATH];
  909. GenericVector<STRING> lines;
  910. if (!flist) {
  911. buf->split('\n', &lines);
  912. if (lines.empty()) return false;
  913. }
  914. // Skip to the requested page number.
  915. for (int i = 0; i < page; i++) {
  916. if (flist) {
  917. if (fgets(pagename, sizeof(pagename), flist) == nullptr) break;
  918. }
  919. }
  920. // Begin producing output
  921. if (renderer && !renderer->BeginDocument(document_title.c_str())) {
  922. return false;
  923. }
  924. // Loop over all pages - or just the requested one
  925. while (true) {
  926. if (flist) {
  927. if (fgets(pagename, sizeof(pagename), flist) == nullptr) break;
  928. } else {
  929. if (page >= lines.size()) break;
  930. snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
  931. }
  932. chomp_string(pagename);
  933. Pix *pix = pixRead(pagename);
  934. if (pix == nullptr) {
  935. tprintf("Image file %s cannot be read!\n", pagename);
  936. return false;
  937. }
  938. tprintf("Page %d : %s\n", page, pagename);
  939. bool r = ProcessPage(pix, page, pagename, retry_config,
  940. timeout_millisec, renderer);
  941. pixDestroy(&pix);
  942. if (!r) return false;
  943. if (tessedit_page_number >= 0) break;
  944. ++page;
  945. }
  946. // Finish producing output
  947. if (renderer && !renderer->EndDocument()) {
  948. return false;
  949. }
  950. return true;
  951. }
  952. bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
  953. size_t size,
  954. const char* filename,
  955. const char* retry_config,
  956. int timeout_millisec,
  957. TessResultRenderer* renderer,
  958. int tessedit_page_number) {
  959. #ifndef ANDROID_BUILD
  960. Pix *pix = nullptr;
  961. int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
  962. size_t offset = 0;
  963. for (; ; ++page) {
  964. if (tessedit_page_number >= 0) {
  965. page = tessedit_page_number;
  966. pix = (data) ? pixReadMemTiff(data, size, page)
  967. : pixReadTiff(filename, page);
  968. } else {
  969. pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
  970. : pixReadFromMultipageTiff(filename, &offset);
  971. }
  972. if (pix == nullptr) break;
  973. tprintf("Page %d\n", page + 1);
  974. char page_str[kMaxIntSize];
  975. snprintf(page_str, kMaxIntSize - 1, "%d", page);
  976. SetVariable("applybox_page", page_str);
  977. bool r = ProcessPage(pix, page, filename, retry_config,
  978. timeout_millisec, renderer);
  979. pixDestroy(&pix);
  980. if (!r) return false;
  981. if (tessedit_page_number >= 0) break;
  982. if (!offset) break;
  983. }
  984. return true;
  985. #else
  986. return false;
  987. #endif
  988. }
  989. // Master ProcessPages calls ProcessPagesInternal and then does any post-
  990. // processing required due to being in a training mode.
  991. bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
  992. int timeout_millisec,
  993. TessResultRenderer* renderer) {
  994. bool result =
  995. ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
  996. #ifndef DISABLED_LEGACY_ENGINE
  997. if (result) {
  998. if (tesseract_->tessedit_train_from_boxes &&
  999. !tesseract_->WriteTRFile(*output_file_)) {
  1000. tprintf("Write of TR file failed: %s\n", output_file_->c_str());
  1001. return false;
  1002. }
  1003. }
  1004. #endif // ndef DISABLED_LEGACY_ENGINE
  1005. return result;
  1006. }
  // Write callback (installed via CURLOPT_WRITEFUNCTION): appends the
  // size * nmemb bytes at contents to the std::string that userp points to
  // and returns the number of bytes appended.
  static size_t
  WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
  {
    const size_t chunk_bytes = size * nmemb;
    auto* sink = static_cast<std::string*>(userp);
    sink->append(static_cast<const char*>(contents), chunk_bytes);
    return chunk_bytes;
  }
  1015. // In the ideal scenario, Tesseract will start working on data as soon
  1016. // as it can. For example, if you stream a filelist through stdin, we
  1017. // should start the OCR process as soon as the first filename is
  1018. // available. This is particularly useful when hooking Tesseract up to
  1019. // slow hardware such as a book scanning machine.
  1020. //
  1021. // Unfortunately there are tradeoffs. You can't seek on stdin. That
  1022. // makes automatic detection of datatype (TIFF? filelist? PNG?)
  1023. // impractical. So we support a command line flag to explicitly
  1024. // identify the scenario that really matters: filelists on
  1025. // stdin. We'll still do our best if the user likes pipes.
  1026. bool TessBaseAPI::ProcessPagesInternal(const char* filename,
  1027. const char* retry_config,
  1028. int timeout_millisec,
  1029. TessResultRenderer* renderer) {
  1030. bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
  1031. if (stdInput) {
  1032. #ifdef WIN32
  1033. if (_setmode(_fileno(stdin), _O_BINARY) == -1)
  1034. tprintf("ERROR: cin to binary: %s", strerror(errno));
  1035. #endif // WIN32
  1036. }
  1037. if (stream_filelist) {
  1038. return ProcessPagesFileList(stdin, nullptr, retry_config,
  1039. timeout_millisec, renderer,
  1040. tesseract_->tessedit_page_number);
  1041. }
  1042. // At this point we are officially in autodection territory.
  1043. // That means any data in stdin must be buffered, to make it
  1044. // seekable.
  1045. std::string buf;
  1046. const l_uint8 *data = nullptr;
  1047. if (stdInput) {
  1048. buf.assign((std::istreambuf_iterator<char>(std::cin)),
  1049. (std::istreambuf_iterator<char>()));
  1050. data = reinterpret_cast<const l_uint8 *>(buf.data());
  1051. } else if (strncmp(filename, "http:", 5) == 0 ||
  1052. strncmp(filename, "https:", 6) == 0 ) {
  1053. // Get image or image list by URL.
  1054. #ifdef HAVE_LIBCURL
  1055. CURL* curl = curl_easy_init();
  1056. if (curl == nullptr) {
  1057. fprintf(stderr, "Error, curl_easy_init failed\n");
  1058. return false;
  1059. } else {
  1060. CURLcode curlcode;
  1061. auto error = [curl, &curlcode](const char* function) {
  1062. fprintf(stderr, "Error, %s failed with error %s\n", function,
  1063. curl_easy_strerror(curlcode));
  1064. curl_easy_cleanup(curl);
  1065. return false;
  1066. };
  1067. curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
  1068. if (curlcode != CURLE_OK) {
  1069. return error("curl_easy_setopt");
  1070. }
  1071. curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
  1072. if (curlcode != CURLE_OK) {
  1073. return error("curl_easy_setopt");
  1074. }
  1075. curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
  1076. if (curlcode != CURLE_OK) {
  1077. return error("curl_easy_setopt");
  1078. }
  1079. curlcode = curl_easy_perform(curl);
  1080. if (curlcode != CURLE_OK) {
  1081. return error("curl_easy_perform");
  1082. }
  1083. curl_easy_cleanup(curl);
  1084. data = reinterpret_cast<const l_uint8 *>(buf.data());
  1085. }
  1086. #else
  1087. fprintf(stderr, "Error, this tesseract has no URL support\n");
  1088. return false;
  1089. #endif
  1090. } else {
  1091. // Check whether the input file can be read.
  1092. if (FILE* file = fopen(filename, "rb")) {
  1093. fclose(file);
  1094. } else {
  1095. fprintf(stderr, "Error, cannot read input file %s: %s\n",
  1096. filename, strerror(errno));
  1097. return false;
  1098. }
  1099. }
  1100. // Here is our autodetection
  1101. int format;
  1102. int r = (data != nullptr) ?
  1103. findFileFormatBuffer(data, &format) :
  1104. findFileFormat(filename, &format);
  1105. // Maybe we have a filelist
  1106. if (r != 0 || format == IFF_UNKNOWN) {
  1107. STRING s;
  1108. if (data != nullptr) {
  1109. s = buf.c_str();
  1110. } else {
  1111. std::ifstream t(filename);
  1112. std::string u((std::istreambuf_iterator<char>(t)),
  1113. std::istreambuf_iterator<char>());
  1114. s = u.c_str();
  1115. }
  1116. return ProcessPagesFileList(nullptr, &s, retry_config,
  1117. timeout_millisec, renderer,
  1118. tesseract_->tessedit_page_number);
  1119. }
  1120. // Maybe we have a TIFF which is potentially multipage
  1121. bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
  1122. format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
  1123. format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
  1124. #if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76
  1125. format == IFF_TIFF_JPEG ||
  1126. #endif
  1127. format == IFF_TIFF_ZIP);
  1128. // Fail early if we can, before producing any output
  1129. Pix *pix = nullptr;
  1130. if (!tiff) {
  1131. pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
  1132. if (pix == nullptr) {
  1133. return false;
  1134. }
  1135. }
  1136. // Begin the output
  1137. if (renderer && !renderer->BeginDocument(document_title.c_str())) {
  1138. pixDestroy(&pix);
  1139. return false;
  1140. }
  1141. // Produce output
  1142. r = (tiff) ?
  1143. ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config,
  1144. timeout_millisec, renderer,
  1145. tesseract_->tessedit_page_number) :
  1146. ProcessPage(pix, 0, filename, retry_config,
  1147. timeout_millisec, renderer);
  1148. // Clean up memory as needed
  1149. pixDestroy(&pix);
  1150. // End the output
  1151. if (!r || (renderer && !renderer->EndDocument())) {
  1152. return false;
  1153. }
  1154. return true;
  1155. }
  1156. bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
  1157. const char* retry_config, int timeout_millisec,
  1158. TessResultRenderer* renderer) {
  1159. SetInputName(filename);
  1160. SetImage(pix);
  1161. bool failed = false;
  1162. if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
  1163. // Disabled character recognition
  1164. PageIterator* it = AnalyseLayout();
  1165. if (it == nullptr) {
  1166. failed = true;
  1167. } else {
  1168. delete it;
  1169. }
  1170. } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
  1171. failed = FindLines() != 0;
  1172. } else if (timeout_millisec > 0) {
  1173. // Running with a timeout.
  1174. ETEXT_DESC monitor;
  1175. monitor.cancel = nullptr;
  1176. monitor.cancel_this = nullptr;
  1177. monitor.set_deadline_msecs(timeout_millisec);
  1178. // Now run the main recognition.
  1179. failed = Recognize(&monitor) < 0;
  1180. } else {
  1181. // Normal layout and character recognition with no timeout.
  1182. failed = Recognize(nullptr) < 0;
  1183. }
  1184. if (tesseract_->tessedit_write_images) {
  1185. #ifndef ANDROID_BUILD
  1186. Pix* page_pix = GetThresholdedImage();
  1187. pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
  1188. #endif // ANDROID_BUILD
  1189. }
  1190. if (failed && retry_config != nullptr && retry_config[0] != '\0') {
  1191. // Save current config variables before switching modes.
  1192. FILE* fp = fopen(kOldVarsFile, "wb");
  1193. if (fp == nullptr) {
  1194. tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
  1195. } else {
  1196. PrintVariables(fp);
  1197. fclose(fp);
  1198. }
  1199. // Switch to alternate mode for retry.
  1200. ReadConfigFile(retry_config);
  1201. SetImage(pix);
  1202. Recognize(nullptr);
  1203. // Restore saved config variables.
  1204. ReadConfigFile(kOldVarsFile);
  1205. }
  1206. if (renderer && !failed) {
  1207. failed = !renderer->AddImage(this);
  1208. }
  1209. return !failed;
  1210. }
  1211. /**
  1212. * Get a left-to-right iterator to the results of LayoutAnalysis and/or
  1213. * Recognize. The returned iterator must be deleted after use.
  1214. */
  1215. LTRResultIterator* TessBaseAPI::GetLTRIterator() {
  1216. if (tesseract_ == nullptr || page_res_ == nullptr)
  1217. return nullptr;
  1218. return new LTRResultIterator(
  1219. page_res_, tesseract_,
  1220. thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
  1221. rect_left_, rect_top_, rect_width_, rect_height_);
  1222. }
  1223. /**
  1224. * Get a reading-order iterator to the results of LayoutAnalysis and/or
  1225. * Recognize. The returned iterator must be deleted after use.
  1226. * WARNING! This class points to data held within the TessBaseAPI class, and
  1227. * therefore can only be used while the TessBaseAPI class still exists and
  1228. * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
  1229. * DetectOS, or anything else that changes the internal PAGE_RES.
  1230. */
  1231. ResultIterator* TessBaseAPI::GetIterator() {
  1232. if (tesseract_ == nullptr || page_res_ == nullptr)
  1233. return nullptr;
  1234. return ResultIterator::StartOfParagraph(LTRResultIterator(
  1235. page_res_, tesseract_,
  1236. thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
  1237. rect_left_, rect_top_, rect_width_, rect_height_));
  1238. }
  1239. /**
  1240. * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
  1241. * The returned iterator must be deleted after use.
  1242. * WARNING! This class points to data held within the TessBaseAPI class, and
  1243. * therefore can only be used while the TessBaseAPI class still exists and
  1244. * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
  1245. * DetectOS, or anything else that changes the internal PAGE_RES.
  1246. */
  1247. MutableIterator* TessBaseAPI::GetMutableIterator() {
  1248. if (tesseract_ == nullptr || page_res_ == nullptr)
  1249. return nullptr;
  1250. return new MutableIterator(page_res_, tesseract_,
  1251. thresholder_->GetScaleFactor(),
  1252. thresholder_->GetScaledYResolution(),
  1253. rect_left_, rect_top_, rect_width_, rect_height_);
  1254. }
  1255. /** Make a text string from the internal data structures. */
  1256. char* TessBaseAPI::GetUTF8Text() {
  1257. if (tesseract_ == nullptr ||
  1258. (!recognition_done_ && Recognize(nullptr) < 0))
  1259. return nullptr;
  1260. STRING text("");
  1261. ResultIterator *it = GetIterator();
  1262. do {
  1263. if (it->Empty(RIL_PARA)) continue;
  1264. const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
  1265. text += para_text.get();
  1266. } while (it->Next(RIL_PARA));
  1267. char* result = new char[text.length() + 1];
  1268. strncpy(result, text.c_str(), text.length() + 1);
  1269. delete it;
  1270. return result;
  1271. }
  1272. static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
  1273. STRING* text) {
  1274. int left, top, right, bottom;
  1275. it->BoundingBox(level, &left, &top, &right, &bottom);
  1276. text->add_str_int("\t", left);
  1277. text->add_str_int("\t", top);
  1278. text->add_str_int("\t", right - left);
  1279. text->add_str_int("\t", bottom - top);
  1280. }
  1281. /**
  1282. * Make a TSV-formatted string from the internal data structures.
  1283. * page_number is 0-based but will appear in the output as 1-based.
  1284. * Returned string must be freed with the delete [] operator.
  1285. */
  1286. char* TessBaseAPI::GetTSVText(int page_number) {
  1287. if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
  1288. return nullptr;
  1289. int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
  1290. int page_id = page_number + 1; // we use 1-based page numbers.
  1291. STRING tsv_str("");
  1292. int page_num = page_id;
  1293. int block_num = 0;
  1294. int par_num = 0;
  1295. int line_num = 0;
  1296. int word_num = 0;
  1297. tsv_str.add_str_int("1\t", page_num); // level 1 - page
  1298. tsv_str.add_str_int("\t", block_num);
  1299. tsv_str.add_str_int("\t", par_num);
  1300. tsv_str.add_str_int("\t", line_num);
  1301. tsv_str.add_str_int("\t", word_num);
  1302. tsv_str.add_str_int("\t", rect_left_);
  1303. tsv_str.add_str_int("\t", rect_top_);
  1304. tsv_str.add_str_int("\t", rect_width_);
  1305. tsv_str.add_str_int("\t", rect_height_);
  1306. tsv_str += "\t-1\t\n";
  1307. ResultIterator* res_it = GetIterator();
  1308. while (!res_it->Empty(RIL_BLOCK)) {
  1309. if (res_it->Empty(RIL_WORD)) {
  1310. res_it->Next(RIL_WORD);
  1311. continue;
  1312. }
  1313. // Add rows for any new block/paragraph/textline.
  1314. if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
  1315. block_num++;
  1316. par_num = 0;
  1317. line_num = 0;
  1318. word_num = 0;
  1319. tsv_str.add_str_int("2\t", page_num); // level 2 - block
  1320. tsv_str.add_str_int("\t", block_num);
  1321. tsv_str.add_str_int("\t", par_num);
  1322. tsv_str.add_str_int("\t", line_num);
  1323. tsv_str.add_str_int("\t", word_num);
  1324. AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
  1325. tsv_str += "\t-1\t\n"; // end of row for block
  1326. }
  1327. if (res_it->IsAtBeginningOf(RIL_PARA)) {
  1328. par_num++;
  1329. line_num = 0;
  1330. word_num = 0;
  1331. tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
  1332. tsv_str.add_str_int("\t", block_num);
  1333. tsv_str.add_str_int("\t", par_num);
  1334. tsv_str.add_str_int("\t", line_num);
  1335. tsv_str.add_str_int("\t", word_num);
  1336. AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
  1337. tsv_str += "\t-1\t\n"; // end of row for para
  1338. }
  1339. if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
  1340. line_num++;
  1341. word_num = 0;
  1342. tsv_str.add_str_int("4\t", page_num); // level 4 - line
  1343. tsv_str.add_str_int("\t", block_num);
  1344. tsv_str.add_str_int("\t", par_num);
  1345. tsv_str.add_str_int("\t", line_num);
  1346. tsv_str.add_str_int("\t", word_num);
  1347. AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
  1348. tsv_str += "\t-1\t\n"; // end of row for line
  1349. }
  1350. // Now, process the word...
  1351. int left, top, right, bottom;
  1352. res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
  1353. word_num++;
  1354. tsv_str.add_str_int("5\t", page_num); // level 5 - word
  1355. tsv_str.add_str_int("\t", block_num);
  1356. tsv_str.add_str_int("\t", par_num);
  1357. tsv_str.add_str_int("\t", line_num);
  1358. tsv_str.add_str_int("\t", word_num);
  1359. tsv_str.add_str_int("\t", left);
  1360. tsv_str.add_str_int("\t", top);
  1361. tsv_str.add_str_int("\t", right - left);
  1362. tsv_str.add_str_int("\t", bottom - top);
  1363. tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
  1364. tsv_str += "\t";
  1365. // Increment counts if at end of block/paragraph/textline.
  1366. if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
  1367. if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
  1368. if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
  1369. do {
  1370. tsv_str +=
  1371. std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
  1372. res_it->Next(RIL_SYMBOL);
  1373. } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
  1374. tsv_str += "\n"; // end of row
  1375. wcnt++;
  1376. }
  1377. char* ret = new char[tsv_str.length() + 1];
  1378. strcpy(ret, tsv_str.c_str());
  1379. delete res_it;
  1380. return ret;
  1381. }
  /** Numbers written per box: the usual 4 coordinates and a page number. */
  constexpr int kNumbersPerBlob = 5;

  /**
   * Bytes taken by each number. ICOORD uses int16_t, so at most 5 digits
   * are assumed.
   */
  constexpr int kBytesPerNumber = 5;

  /**
   * Per-box multiplier for the maximum expected text length: each of the
   * kNumbersPerBlob numbers takes kBytesPerNumber bytes plus a space, and
   * the line ends with a newline. The original UTF8 characters and one
   * kMaxBytesPerLine (for safety) are added on top of this.
   */
  constexpr int kBytesPerBoxFileLine =
      kNumbersPerBlob * (kBytesPerNumber + 1) + 1;

  /** Max bytes in the decimal representation of int64_t. */
  constexpr int kBytesPer64BitNumber = 20;
  1397. /**
  1398. * A maximal single box could occupy kNumbersPerBlob numbers at
  1399. * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a
  1400. * space plus the newline and the maximum length of a UNICHAR.
  1401. * Test against this on each iteration for safety.
  1402. */
  1403. const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 +
  1404. UNICHAR_LEN;
  1405. /**
  1406. * The recognized text is returned as a char* which is coded
  1407. * as a UTF8 box file.
  1408. * page_number is a 0-base page index that will appear in the box file.
  1409. * Returned string must be freed with the delete [] operator.
  1410. */
  1411. char* TessBaseAPI::GetBoxText(int page_number) {
  1412. if (tesseract_ == nullptr ||
  1413. (!recognition_done_ && Recognize(nullptr) < 0))
  1414. return nullptr;
  1415. int blob_count;
  1416. int utf8_length = TextLength(&blob_count);
  1417. int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
  1418. kMaxBytesPerLine;
  1419. char* result = new char[total_length];
  1420. result[0] = '\0';
  1421. int output_length = 0;
  1422. LTRResultIterator* it = GetLTRIterator();
  1423. do {
  1424. int left, top, right, bottom;
  1425. if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
  1426. const std::unique_ptr</*non-const*/ char[]> text(
  1427. it->GetUTF8Text(RIL_SYMBOL));
  1428. // Tesseract uses space for recognition failure. Fix to a reject
  1429. // character, kTesseractReject so we don't create illegal box files.
  1430. for (int i = 0; text[i] != '\0'; ++i) {
  1431. if (text[i] == ' ')
  1432. text[i] = kTesseractReject;
  1433. }
  1434. snprintf(result + output_length, total_length - output_length,
  1435. "%s %d %d %d %d %d\n", text.get(), left, image_height_ - bottom,
  1436. right, image_height_ - top, page_number);
  1437. output_length += strlen(result + output_length);
  1438. // Just in case...
  1439. if (output_length + kMaxBytesPerLine > total_length)
  1440. break;
  1441. }
  1442. } while (it->Next(RIL_SYMBOL));
  1443. delete it;
  1444. return result;
  1445. }
  /**
   * Conversion table for non-latin characters.
   * Lists the unicode characters outside the latin set that are mapped
   * into the latin set; the list is terminated by 0.
   * TODO(rays) incorporate this translation into unicharset.
   */
  const int kUniChs[] = {
      0x20ac,  // euro sign
      0x201c,  // left double quotation mark
      0x201d,  // right double quotation mark
      0x2018,  // left single quotation mark
      0x2019,  // right single quotation mark
      0x2022,  // bullet
      0x2014,  // em dash
      0        // terminator
  };
  /**
   * Latin replacement characters, index-aligned with the table of unicode
   * characters they stand in for; the list is terminated by 0.
   */
  const int kLatinChs[] = {
      0x00a2,  // cent sign
      0x0022,  // quotation mark
      0x0022,  // quotation mark
      0x0027,  // apostrophe
      0x0027,  // apostrophe
      0x00b7,  // middle dot
      0x002d,  // hyphen-minus
      0        // terminator
  };
  1458. /**
  1459. * The recognized text is returned as a char* which is coded
  1460. * as UNLV format Latin-1 with specific reject and suspect codes.
  1461. * Returned string must be freed with the delete [] operator.
  1462. */
  1463. char* TessBaseAPI::GetUNLVText() {
  1464. if (tesseract_ == nullptr ||
  1465. (!recognition_done_ && Recognize(nullptr) < 0))
  1466. return nullptr;
  1467. bool tilde_crunch_written = false;
  1468. bool last_char_was_newline = true;
  1469. bool last_char_was_tilde = false;
  1470. int total_length = TextLength(nullptr);
  1471. PAGE_RES_IT page_res_it(page_res_);
  1472. char* result = new char[total_length];
  1473. char* ptr = result;
  1474. for (page_res_it.restart_page(); page_res_it.word () != nullptr;
  1475. page_res_it.forward()) {
  1476. WERD_RES *word = page_res_it.word();
  1477. // Process the current word.
  1478. if (word->unlv_crunch_mode != CR_NONE) {
  1479. if (word->unlv_crunch_mode != CR_DELETE &&
  1480. (!tilde_crunch_written ||
  1481. (word->unlv_crunch_mode == CR_KEEP_SPACE &&
  1482. word->word->space() > 0 &&
  1483. !word->word->flag(W_FUZZY_NON) &&
  1484. !word->word->flag(W_FUZZY_SP)))) {
  1485. if (!word->word->flag(W_BOL) &&
  1486. word->word->space() > 0 &&
  1487. !word->word->flag(W_FUZZY_NON) &&
  1488. !word->word->flag(W_FUZZY_SP)) {
  1489. /* Write a space to separate from preceding good text */
  1490. *ptr++ = ' ';
  1491. last_char_was_tilde = false;
  1492. }
  1493. if (!last_char_was_tilde) {
  1494. // Write a reject char.
  1495. last_char_was_tilde = true;
  1496. *ptr++ = kUNLVReject;
  1497. tilde_crunch_written = true;
  1498. last_char_was_newline = false;
  1499. }
  1500. }
  1501. } else {
  1502. // NORMAL PROCESSING of non tilde crunched words.
  1503. tilde_crunch_written = false;
  1504. tesseract_->set_unlv_suspects(word);
  1505. const char* wordstr = word->best_choice->unichar_string().c_str();
  1506. const STRING& lengths = word->best_choice->unichar_lengths();
  1507. int length = lengths.length();
  1508. int i = 0;
  1509. int offset = 0;
  1510. if (last_char_was_tilde &&
  1511. word->word->space() == 0 && wordstr[offset] == ' ') {
  1512. // Prevent adjacent tilde across words - we know that adjacent tildes
  1513. // within words have been removed.
  1514. // Skip the first character.
  1515. offset = lengths[i++];
  1516. }
  1517. if (i < length && wordstr[offset] != 0) {
  1518. if (!last_char_was_newline)
  1519. *ptr++ = ' ';
  1520. else
  1521. last_char_was_newline = false;
  1522. for (; i < length; offset += lengths[i++]) {
  1523. if (wordstr[offset] == ' ' ||
  1524. wordstr[offset] == kTesseractReject) {
  1525. *ptr++ = kUNLVReject;
  1526. last_char_was_tilde = true;
  1527. } else {
  1528. if (word->reject_map[i].rejected())
  1529. *ptr++ = kUNLVSuspect;
  1530. UNICHAR ch(wordstr + offset, lengths[i]);
  1531. int uni_ch = ch.first_uni();
  1532. for (int j = 0; kUniChs[j] != 0; ++j) {
  1533. if (kUniChs[j] == uni_ch) {
  1534. uni_ch = kLatinChs[j];
  1535. break;
  1536. }
  1537. }
  1538. if (uni_ch <= 0xff) {
  1539. *ptr++ = static_cast<char>(uni_ch);
  1540. last_char_was_tilde = false;
  1541. } else {
  1542. *ptr++ = kUNLVReject;
  1543. last_char_was_tilde = true;
  1544. }
  1545. }
  1546. }
  1547. }
  1548. }
  1549. if (word->word->flag(W_EOL) && !last_char_was_newline) {
  1550. /* Add a new line output */
  1551. *ptr++ = '\n';
  1552. tilde_crunch_written = false;
  1553. last_char_was_newline = true;
  1554. last_char_was_tilde = false;
  1555. }
  1556. }
  1557. *ptr++ = '\n';
  1558. *ptr = '\0';
  1559. return result;
  1560. }
  1561. #ifndef DISABLED_LEGACY_ENGINE
  1562. /**
  1563. * Detect the orientation of the input image and apparent script (alphabet).
  1564. * orient_deg is the detected clockwise rotation of the input image in degrees
  1565. * (0, 90, 180, 270)
  1566. * orient_conf is the confidence (15.0 is reasonably confident)
  1567. * script_name is an ASCII string, the name of the script, e.g. "Latin"
  1568. * script_conf is confidence level in the script
  1569. * Returns true on success and writes values to each parameter as an output
  1570. */
  1571. bool TessBaseAPI::DetectOrientationScript(int* orient_deg, float* orient_conf,
  1572. const char** script_name,
  1573. float* script_conf) {
  1574. OSResults osr;
  1575. bool osd = DetectOS(&osr);
  1576. if (!osd) {
  1577. return false;
  1578. }
  1579. int orient_id = osr.best_result.orientation_id;
  1580. int script_id = osr.get_best_script(orient_id);
  1581. if (orient_conf) *orient_conf = osr.best_result.oconfidence;
  1582. if (orient_deg) *orient_deg = orient_id * 90; // convert quadrant to degrees
  1583. if (script_name) {
  1584. const char* script = osr.unicharset->get_script_from_script_id(script_id);
  1585. *script_name = script;
  1586. }
  1587. if (script_conf) *script_conf = osr.best_result.sconfidence;
  1588. return true;
  1589. }
  1590. /**
  1591. * The recognized text is returned as a char* which is coded
  1592. * as UTF8 and must be freed with the delete [] operator.
  1593. * page_number is a 0-based page index that will appear in the osd file.
  1594. */
  1595. char* TessBaseAPI::GetOsdText(int page_number) {
  1596. int orient_deg;
  1597. float orient_conf;
  1598. const char* script_name;
  1599. float script_conf;
  1600. if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name,
  1601. &script_conf))
  1602. return nullptr;
  1603. // clockwise rotation needed to make the page upright
  1604. int rotate = OrientationIdToValue(orient_deg / 90);
  1605. std::stringstream stream;
  1606. // Use "C" locale (needed for float values orient_conf and script_conf).
  1607. stream.imbue(std::locale::classic());
  1608. // Use fixed notation with 2 digits after the decimal point for float values.
  1609. stream.precision(2);
  1610. stream
  1611. << std::fixed
  1612. << "Page number: " << page_number << "\n"
  1613. << "Orientation in degrees: " << orient_deg << "\n"
  1614. << "Rotate: " << rotate << "\n"
  1615. << "Orientation confidence: " << orient_conf << "\n"
  1616. << "Script: " << script_name << "\n"
  1617. << "Script confidence: " << script_conf << "\n";
  1618. const std::string& text = stream.str();
  1619. char* result = new char[text.length() + 1];
  1620. strcpy(result, text.c_str());
  1621. return result;
  1622. }
  1623. #endif // ndef DISABLED_LEGACY_ENGINE
  1624. /** Returns the average word confidence for Tesseract page result. */
  1625. int TessBaseAPI::MeanTextConf() {
  1626. int* conf = AllWordConfidences();
  1627. if (!conf) return 0;
  1628. int sum = 0;
  1629. int *pt = conf;
  1630. while (*pt >= 0) sum += *pt++;
  1631. if (pt != conf) sum /= pt - conf;
  1632. delete [] conf;
  1633. return sum;
  1634. }
  1635. /** Returns an array of all word confidences, terminated by -1. */
  1636. int* TessBaseAPI::AllWordConfidences() {
  1637. if (tesseract_ == nullptr ||
  1638. (!recognition_done_ && Recognize(nullptr) < 0))
  1639. return nullptr;
  1640. int n_word = 0;
  1641. PAGE_RES_IT res_it(page_res_);
  1642. for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward())
  1643. n_word++;
  1644. int* conf = new int[n_word+1];
  1645. n_word = 0;
  1646. for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
  1647. WERD_RES *word = res_it.word();
  1648. WERD_CHOICE* choice = word->best_choice;
  1649. int w_conf = static_cast<int>(100 + 5 * choice->certainty());
  1650. // This is the eq for converting Tesseract confidence to 1..100
  1651. if (w_conf < 0) w_conf = 0;
  1652. if (w_conf > 100) w_conf = 100;
  1653. conf[n_word++] = w_conf;
  1654. }
  1655. conf[n_word] = -1;
  1656. return conf;
  1657. }
  1658. #ifndef DISABLED_LEGACY_ENGINE
  1659. /**
  1660. * Applies the given word to the adaptive classifier if possible.
  1661. * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
  1662. * tell the boundaries of the graphemes.
  1663. * Assumes that SetImage/SetRectangle have been used to set the image
  1664. * to the given word. The mode arg should be PSM_SINGLE_WORD or
  1665. * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
  1666. * The currently set PageSegMode is preserved.
  1667. * Returns false if adaption was not possible for some reason.
  1668. */
  1669. bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) {
  1670. int debug = 0;
  1671. GetIntVariable("applybox_debug", &debug);
  1672. bool success = true;
  1673. PageSegMode current_psm = GetPageSegMode();
  1674. SetPageSegMode(mode);
  1675. SetVariable("classify_enable_learning", "0");
  1676. const std::unique_ptr<const char[]> text(GetUTF8Text());
  1677. if (debug) {
  1678. tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
  1679. }
  1680. if (text != nullptr) {
  1681. PAGE_RES_IT it(page_res_);
  1682. WERD_RES* word_res = it.word();
  1683. if (word_res != nullptr) {
  1684. word_res->word->set_text(wordstr);
  1685. // Check to see if text matches wordstr.
  1686. int w = 0;
  1687. int t;
  1688. for (t = 0; text[t] != '\0'; ++t) {
  1689. if (text[t] == '\n' || text[t] == ' ')
  1690. continue;
  1691. while (wordstr[w] == ' ') ++w;
  1692. if (text[t] != wordstr[w])
  1693. break;
  1694. ++w;
  1695. }
  1696. if (text[t] != '\0' || wordstr[w] != '\0') {
  1697. // No match.
  1698. delete page_res_;
  1699. GenericVector<TBOX> boxes;
  1700. page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);
  1701. tesseract_->ReSegmentByClassification(page_res_);
  1702. tesseract_->TidyUp(page_res_);
  1703. PAGE_RES_IT pr_it(page_res_);
  1704. if (pr_it.word() == nullptr)
  1705. success = false;
  1706. else
  1707. word_res = pr_it.word();
  1708. } else {
  1709. word_res->BestChoiceToCorrectText();
  1710. }
  1711. if (success) {
  1712. tesseract_->EnableLearning = true;
  1713. tesseract_->LearnWord(nullptr, word_res);
  1714. }
  1715. } else {
  1716. success = false;
  1717. }
  1718. } else {
  1719. success = false;
  1720. }
  1721. SetPageSegMode(current_psm);
  1722. return success;
  1723. }
  1724. #endif // ndef DISABLED_LEGACY_ENGINE
  1725. /**
  1726. * Free up recognition results and any stored image data, without actually
  1727. * freeing any recognition data that would be time-consuming to reload.
  1728. * Afterwards, you must call SetImage or TesseractRect before doing
  1729. * any Recognize or Get* operation.
  1730. */
  1731. void TessBaseAPI::Clear() {
  1732. if (thresholder_ != nullptr)
  1733. thresholder_->Clear();
  1734. ClearResults();
  1735. if (tesseract_ != nullptr) SetInputImage(nullptr);
  1736. }
  1737. /**
  1738. * Close down tesseract and free up all memory. End() is equivalent to
  1739. * destructing and reconstructing your TessBaseAPI.
  1740. * Once End() has been used, none of the other API functions may be used
  1741. * other than Init and anything declared above it in the class definition.
  1742. */
  1743. void TessBaseAPI::End() {
  1744. Clear();
  1745. delete thresholder_;
  1746. thresholder_ = nullptr;
  1747. delete page_res_;
  1748. page_res_ = nullptr;
  1749. delete block_list_;
  1750. block_list_ = nullptr;
  1751. if (paragraph_models_ != nullptr) {
  1752. paragraph_models_->delete_data_pointers();
  1753. delete paragraph_models_;
  1754. paragraph_models_ = nullptr;
  1755. }
  1756. if (osd_tesseract_ == tesseract_) osd_tesseract_ = nullptr;
  1757. delete tesseract_;
  1758. tesseract_ = nullptr;
  1759. delete osd_tesseract_;
  1760. osd_tesseract_ = nullptr;
  1761. delete equ_detect_;
  1762. equ_detect_ = nullptr;
  1763. delete input_file_;
  1764. input_file_ = nullptr;
  1765. delete output_file_;
  1766. output_file_ = nullptr;
  1767. delete datapath_;
  1768. datapath_ = nullptr;
  1769. delete language_;
  1770. language_ = nullptr;
  1771. }
  1772. // Clear any library-level memory caches.
  1773. // There are a variety of expensive-to-load constant data structures (mostly
  1774. // language dictionaries) that are cached globally -- surviving the Init()
  1775. // and End() of individual TessBaseAPI's. This function allows the clearing
  1776. // of these caches.
  1777. void TessBaseAPI::ClearPersistentCache() {
  1778. Dict::GlobalDawgCache()->DeleteUnusedDawgs();
  1779. }
  1780. /**
  1781. * Check whether a word is valid according to Tesseract's language model
  1782. * returns 0 if the word is invalid, non-zero if valid
  1783. */
  1784. int TessBaseAPI::IsValidWord(const char *word) {
  1785. return tesseract_->getDict().valid_word(word);
  1786. }
  1787. // Returns true if utf8_character is defined in the UniCharset.
  1788. bool TessBaseAPI::IsValidCharacter(const char *utf8_character) {
  1789. return tesseract_->unicharset.contains_unichar(utf8_character);
  1790. }
  1791. // TODO(rays) Obsolete this function and replace with a more aptly named
  1792. // function that returns image coordinates rather than tesseract coordinates.
  1793. bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
  1794. PageIterator* it = AnalyseLayout();
  1795. if (it == nullptr) {
  1796. return false;
  1797. }
  1798. int x1, x2, y1, y2;
  1799. it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
  1800. // Calculate offset and slope (NOTE: Kind of ugly)
  1801. if (x2 <= x1) x2 = x1 + 1;
  1802. // Convert the point pair to slope/offset of the baseline (in image coords.)
  1803. *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
  1804. *out_offset = static_cast<int>(y1 - *out_slope * x1);
  1805. // Get the y-coord of the baseline at the left and right edges of the
  1806. // textline's bounding box.
  1807. int left, top, right, bottom;
  1808. if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
  1809. delete it;
  1810. return false;
  1811. }
  1812. int left_y = IntCastRounded(*out_slope * left + *out_offset);
  1813. int right_y = IntCastRounded(*out_slope * right + *out_offset);
  1814. // Shift the baseline down so it passes through the nearest bottom-corner
  1815. // of the textline's bounding box. This is the difference between the y
  1816. // at the lowest (max) edge of the box and the actual box bottom.
  1817. *out_offset += bottom - std::max(left_y, right_y);
  1818. // Switch back to bottom-up tesseract coordinates. Requires negation of
  1819. // the slope and height - offset for the offset.
  1820. *out_slope = -*out_slope;
  1821. *out_offset = rect_height_ - *out_offset;
  1822. delete it;
  1823. return true;
  1824. }
  1825. /** Sets Dict::letter_is_okay_ function to point to the given function. */
  1826. void TessBaseAPI::SetDictFunc(DictFunc f) {
  1827. if (tesseract_ != nullptr) {
  1828. tesseract_->getDict().letter_is_okay_ = f;
  1829. }
  1830. }
  1831. /**
  1832. * Sets Dict::probability_in_context_ function to point to the given
  1833. * function.
  1834. *
  1835. * @param f A single function that returns the probability of the current
  1836. * "character" (in general a utf-8 string), given the context of a previous
  1837. * utf-8 string.
  1838. */
  1839. void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {
  1840. if (tesseract_ != nullptr) {
  1841. tesseract_->getDict().probability_in_context_ = f;
  1842. // Set it for the sublangs too.
  1843. int num_subs = tesseract_->num_sub_langs();
  1844. for (int i = 0; i < num_subs; ++i) {
  1845. tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f;
  1846. }
  1847. }
  1848. }
  1849. #ifndef DISABLED_LEGACY_ENGINE
  1850. /** Sets Wordrec::fill_lattice_ function to point to the given function. */
  1851. void TessBaseAPI::SetFillLatticeFunc(FillLatticeFunc f) {
  1852. if (tesseract_ != nullptr) tesseract_->fill_lattice_ = f;
  1853. }
  1854. #endif // ndef DISABLED_LEGACY_ENGINE
  1855. /** Common code for setting the image. */
  1856. bool TessBaseAPI::InternalSetImage() {
  1857. if (tesseract_ == nullptr) {
  1858. tprintf("Please call Init before attempting to set an image.\n");
  1859. return false;
  1860. }
  1861. if (thresholder_ == nullptr)
  1862. thresholder_ = new ImageThresholder;
  1863. ClearResults();
  1864. return true;
  1865. }
  1866. /**
  1867. * Run the thresholder to make the thresholded image, returned in pix,
  1868. * which must not be nullptr. *pix must be initialized to nullptr, or point
  1869. * to an existing pixDestroyable Pix.
  1870. * The usual argument to Threshold is Tesseract::mutable_pix_binary().
  1871. */
  1872. bool TessBaseAPI::Threshold(Pix** pix) {
  1873. ASSERT_HOST(pix != nullptr);
  1874. if (*pix != nullptr)
  1875. pixDestroy(pix);
  1876. // Zero resolution messes up the algorithms, so make sure it is credible.
  1877. int user_dpi = 0;
  1878. GetIntVariable("user_defined_dpi", &user_dpi);
  1879. int y_res = thresholder_->GetScaledYResolution();
  1880. if (user_dpi && (user_dpi < kMinCredibleResolution ||
  1881. user_dpi > kMaxCredibleResolution)) {
  1882. tprintf("Warning: User defined image dpi is outside of expected range "
  1883. "(%d - %d)!\n",
  1884. kMinCredibleResolution, kMaxCredibleResolution);
  1885. }
  1886. // Always use user defined dpi
  1887. if (user_dpi) {
  1888. thresholder_->SetSourceYResolution(user_dpi);
  1889. } else if (y_res < kMinCredibleResolution ||
  1890. y_res > kMaxCredibleResolution) {
  1891. tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n",
  1892. y_res, kMinCredibleResolution);
  1893. thresholder_->SetSourceYResolution(kMinCredibleResolution);
  1894. }
  1895. auto pageseg_mode =
  1896. static_cast<PageSegMode>(
  1897. static_cast<int>(tesseract_->tessedit_pageseg_mode));
  1898. if (!thresholder_->ThresholdToPix(pageseg_mode, pix)) return false;
  1899. thresholder_->GetImageSizes(&rect_left_, &rect_top_,
  1900. &rect_width_, &rect_height_,
  1901. &image_width_, &image_height_);
  1902. if (!thresholder_->IsBinary()) {
  1903. tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds());
  1904. tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());
  1905. } else {
  1906. tesseract_->set_pix_thresholds(nullptr);
  1907. tesseract_->set_pix_grey(nullptr);
  1908. }
  1909. // Set the internal resolution that is used for layout parameters from the
  1910. // estimated resolution, rather than the image resolution, which may be
  1911. // fabricated, but we will use the image resolution, if there is one, to
  1912. // report output point sizes.
  1913. int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
  1914. kMinCredibleResolution,
  1915. kMaxCredibleResolution);
  1916. if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
  1917. tprintf("Estimated internal resolution %d out of range! "
  1918. "Corrected to %d.\n",
  1919. thresholder_->GetScaledEstimatedResolution(), estimated_res);
  1920. }
  1921. tesseract_->set_source_resolution(estimated_res);
  1922. return true;
  1923. }
  1924. /** Find lines from the image making the BLOCK_LIST. */
  1925. int TessBaseAPI::FindLines() {
  1926. if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
  1927. tprintf("Please call SetImage before attempting recognition.\n");
  1928. return -1;
  1929. }
  1930. if (recognition_done_)
  1931. ClearResults();
  1932. if (!block_list_->empty()) {
  1933. return 0;
  1934. }
  1935. if (tesseract_ == nullptr) {
  1936. tesseract_ = new Tesseract;
  1937. #ifndef DISABLED_LEGACY_ENGINE
  1938. tesseract_->InitAdaptiveClassifier(nullptr);
  1939. #endif
  1940. }
  1941. if (tesseract_->pix_binary() == nullptr &&
  1942. !Threshold(tesseract_->mutable_pix_binary())) {
  1943. return -1;
  1944. }
  1945. tesseract_->PrepareForPageseg();
  1946. #ifndef DISABLED_LEGACY_ENGINE
  1947. if (tesseract_->textord_equation_detect) {
  1948. if (equ_detect_ == nullptr && datapath_ != nullptr) {
  1949. equ_detect_ = new EquationDetect(datapath_->c_str(), nullptr);
  1950. }
  1951. if (equ_detect_ == nullptr) {
  1952. tprintf("Warning: Could not set equation detector\n");
  1953. } else {
  1954. tesseract_->SetEquationDetect(equ_detect_);
  1955. }
  1956. }
  1957. #endif // ndef DISABLED_LEGACY_ENGINE
  1958. Tesseract* osd_tess = osd_tesseract_;
  1959. OSResults osr;
  1960. if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) &&
  1961. osd_tess == nullptr) {
  1962. if (strcmp(language_->c_str(), "osd") == 0) {
  1963. osd_tess = tesseract_;
  1964. } else {
  1965. osd_tesseract_ = new Tesseract;
  1966. TessdataManager mgr(reader_);
  1967. if (datapath_ == nullptr) {
  1968. tprintf("Warning: Auto orientation and script detection requested,"
  1969. " but data path is undefined\n");
  1970. delete osd_tesseract_;
  1971. osd_tesseract_ = nullptr;
  1972. } else if (osd_tesseract_->init_tesseract(datapath_->c_str(), nullptr,
  1973. "osd", OEM_TESSERACT_ONLY,
  1974. nullptr, 0, nullptr, nullptr,
  1975. false, &mgr) == 0) {
  1976. osd_tess = osd_tesseract_;
  1977. osd_tesseract_->set_source_resolution(
  1978. thresholder_->GetSourceYResolution());
  1979. } else {
  1980. tprintf("Warning: Auto orientation and script detection requested,"
  1981. " but osd language failed to load\n");
  1982. delete osd_tesseract_;
  1983. osd_tesseract_ = nullptr;
  1984. }
  1985. }
  1986. }
  1987. if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0)
  1988. return -1;
  1989. // If Devanagari is being recognized, we use different images for page seg
  1990. // and for OCR.
  1991. tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
  1992. return 0;
  1993. }
  1994. /** Delete the pageres and clear the block list ready for a new page. */
  1995. void TessBaseAPI::ClearResults() {
  1996. if (tesseract_ != nullptr) {
  1997. tesseract_->Clear();
  1998. }
  1999. delete page_res_;
  2000. page_res_ = nullptr;
  2001. recognition_done_ = false;
  2002. if (block_list_ == nullptr)
  2003. block_list_ = new BLOCK_LIST;
  2004. else
  2005. block_list_->clear();
  2006. if (paragraph_models_ != nullptr) {
  2007. paragraph_models_->delete_data_pointers();
  2008. delete paragraph_models_;
  2009. paragraph_models_ = nullptr;
  2010. }
  2011. }
  2012. /**
  2013. * Return the length of the output text string, as UTF8, assuming
  2014. * liberally two spacing marks after each word (as paragraphs end with two
  2015. * newlines), and assuming a single character reject marker for each rejected
  2016. * character.
  2017. * Also return the number of recognized blobs in blob_count.
  2018. */
  2019. int TessBaseAPI::TextLength(int* blob_count) {
  2020. if (tesseract_ == nullptr || page_res_ == nullptr)
  2021. return 0;
  2022. PAGE_RES_IT page_res_it(page_res_);
  2023. int total_length = 2;
  2024. int total_blobs = 0;
  2025. // Iterate over the data structures to extract the recognition result.
  2026. for (page_res_it.restart_page(); page_res_it.word () != nullptr;
  2027. page_res_it.forward()) {
  2028. WERD_RES *word = page_res_it.word();
  2029. WERD_CHOICE* choice = word->best_choice;
  2030. if (choice != nullptr) {
  2031. total_blobs += choice->length() + 2;
  2032. total_length += choice->unichar_string().length() + 2;
  2033. for (int i = 0; i < word->reject_map.length(); ++i) {
  2034. if (word->reject_map[i].rejected())
  2035. ++total_length;
  2036. }
  2037. }
  2038. }
  2039. if (blob_count != nullptr)
  2040. *blob_count = total_blobs;
  2041. return total_length;
  2042. }
  2043. #ifndef DISABLED_LEGACY_ENGINE
  2044. /**
  2045. * Estimates the Orientation And Script of the image.
  2046. * Returns true if the image was processed successfully.
  2047. */
  2048. bool TessBaseAPI::DetectOS(OSResults* osr) {
  2049. if (tesseract_ == nullptr)
  2050. return false;
  2051. ClearResults();
  2052. if (tesseract_->pix_binary() == nullptr &&
  2053. !Threshold(tesseract_->mutable_pix_binary())) {
  2054. return false;
  2055. }
  2056. if (input_file_ == nullptr)
  2057. input_file_ = new STRING(kInputFile);
  2058. return orientation_and_script_detection(*input_file_, osr, tesseract_) > 0;
  2059. }
  2060. #endif // ndef DISABLED_LEGACY_ENGINE
  2061. void TessBaseAPI::set_min_orientation_margin(double margin) {
  2062. tesseract_->min_orientation_margin.set_value(margin);
  2063. }
  2064. /**
  2065. * Return text orientation of each block as determined in an earlier page layout
  2066. * analysis operation. Orientation is returned as the number of ccw 90-degree
  2067. * rotations (in [0..3]) required to make the text in the block upright
  2068. * (readable). Note that this may not necessary be the block orientation
  2069. * preferred for recognition (such as the case of vertical CJK text).
  2070. *
  2071. * Also returns whether the text in the block is believed to have vertical
  2072. * writing direction (when in an upright page orientation).
  2073. *
  2074. * The returned array is of length equal to the number of text blocks, which may
  2075. * be less than the total number of blocks. The ordering is intended to be
  2076. * consistent with GetTextLines().
  2077. */
  2078. void TessBaseAPI::GetBlockTextOrientations(int** block_orientation,
  2079. bool** vertical_writing) {
  2080. delete[] *block_orientation;
  2081. *block_orientation = nullptr;
  2082. delete[] *vertical_writing;
  2083. *vertical_writing = nullptr;
  2084. BLOCK_IT block_it(block_list_);
  2085. block_it.move_to_first();
  2086. int num_blocks = 0;
  2087. for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
  2088. if (!block_it.data()->pdblk.poly_block()->IsText()) {
  2089. continue;
  2090. }
  2091. ++num_blocks;
  2092. }
  2093. if (!num_blocks) {
  2094. tprintf("WARNING: Found no blocks\n");
  2095. return;
  2096. }
  2097. *block_orientation = new int[num_blocks];
  2098. *vertical_writing = new bool[num_blocks];
  2099. block_it.move_to_first();
  2100. int i = 0;
  2101. for (block_it.mark_cycle_pt(); !block_it.cycled_list();
  2102. block_it.forward()) {
  2103. if (!block_it.data()->pdblk.poly_block()->IsText()) {
  2104. continue;
  2105. }
  2106. FCOORD re_rotation = block_it.data()->re_rotation();
  2107. float re_theta = re_rotation.angle();
  2108. FCOORD classify_rotation = block_it.data()->classify_rotation();
  2109. float classify_theta = classify_rotation.angle();
  2110. double rot_theta = - (re_theta - classify_theta) * 2.0 / M_PI;
  2111. if (rot_theta < 0) rot_theta += 4;
  2112. int num_rotations = static_cast<int>(rot_theta + 0.5);
  2113. (*block_orientation)[i] = num_rotations;
  2114. // The classify_rotation is non-zero only if the text has vertical
  2115. // writing direction.
  2116. (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
  2117. ++i;
  2118. }
  2119. }
  2120. void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
  2121. int debug_level = 0;
  2122. GetIntVariable("paragraph_debug_level", &debug_level);
  2123. if (paragraph_models_ == nullptr)
  2124. paragraph_models_ = new GenericVector<ParagraphModel*>;
  2125. MutableIterator *result_it = GetMutableIterator();
  2126. do { // Detect paragraphs for this block
  2127. GenericVector<ParagraphModel *> models;
  2128. ::tesseract::DetectParagraphs(debug_level, after_text_recognition,
  2129. result_it, &models);
  2130. *paragraph_models_ += models;
  2131. } while (result_it->Next(RIL_BLOCK));
  2132. delete result_it;
  2133. }
  2134. /** This method returns the string form of the specified unichar. */
  2135. const char* TessBaseAPI::GetUnichar(int unichar_id) {
  2136. return tesseract_->unicharset.id_to_unichar(unichar_id);
  2137. }
  2138. /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
  2139. const Dawg *TessBaseAPI::GetDawg(int i) const {
  2140. if (tesseract_ == nullptr || i >= NumDawgs()) return nullptr;
  2141. return tesseract_->getDict().GetDawg(i);
  2142. }
  2143. /** Return the number of dawgs loaded into tesseract_ object. */
  2144. int TessBaseAPI::NumDawgs() const {
  2145. return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();
  2146. }
  2147. /** Escape a char string - remove <>&"' with HTML codes. */
  2148. STRING HOcrEscape(const char* text) {
  2149. STRING ret;
  2150. const char *ptr;
  2151. for (ptr = text; *ptr; ptr++) {
  2152. switch (*ptr) {
  2153. case '<': ret += "&lt;"; break;
  2154. case '>': ret += "&gt;"; break;
  2155. case '&': ret += "&amp;"; break;
  2156. case '"': ret += "&quot;"; break;
  2157. case '\'': ret += "&#39;"; break;
  2158. default: ret += *ptr;
  2159. }
  2160. }
  2161. return ret;
  2162. }
  2163. #ifndef DISABLED_LEGACY_ENGINE
  2164. // ____________________________________________________________________________
  2165. // Ocropus add-ons.
  2166. /** Find lines from the image making the BLOCK_LIST. */
  2167. BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
  2168. ASSERT_HOST(FindLines() == 0);
  2169. BLOCK_LIST* result = block_list_;
  2170. block_list_ = nullptr;
  2171. return result;
  2172. }
  2173. /**
  2174. * Delete a block list.
  2175. * This is to keep BLOCK_LIST pointer opaque
  2176. * and let go of including the other headers.
  2177. */
  2178. void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
  2179. delete block_list;
  2180. }
  2181. ROW *TessBaseAPI::MakeTessOCRRow(float baseline,
  2182. float xheight,
  2183. float descender,
  2184. float ascender) {
  2185. int32_t xstarts[] = {-32000};
  2186. double quad_coeffs[] = {0, 0, baseline};
  2187. return new ROW(1,
  2188. xstarts,
  2189. quad_coeffs,
  2190. xheight,
  2191. ascender - (baseline + xheight),
  2192. descender - baseline,
  2193. 0,
  2194. 0);
  2195. }
  2196. /** Creates a TBLOB* from the whole pix. */
  2197. TBLOB *TessBaseAPI::MakeTBLOB(Pix *pix) {
  2198. int width = pixGetWidth(pix);
  2199. int height = pixGetHeight(pix);
  2200. BLOCK block("a character", true, 0, 0, 0, 0, width, height);
  2201. // Create C_BLOBs from the page
  2202. extract_edges(pix, &block);
  2203. // Merge all C_BLOBs
  2204. C_BLOB_LIST *list = block.blob_list();
  2205. C_BLOB_IT c_blob_it(list);
  2206. if (c_blob_it.empty())
  2207. return nullptr;
  2208. // Move all the outlines to the first blob.
  2209. C_OUTLINE_IT ol_it(c_blob_it.data()->out_list());
  2210. for (c_blob_it.forward();
  2211. !c_blob_it.at_first();
  2212. c_blob_it.forward()) {
  2213. C_BLOB *c_blob = c_blob_it.data();
  2214. ol_it.add_list_after(c_blob->out_list());
  2215. }
  2216. // Convert the first blob to the output TBLOB.
  2217. return TBLOB::PolygonalCopy(false, c_blob_it.data());
  2218. }
  2219. /**
  2220. * This method baseline normalizes a TBLOB in-place. The input row is used
  2221. * for normalization. The denorm is an optional parameter in which the
  2222. * normalization-antidote is returned.
  2223. */
  2224. void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode) {
  2225. TBOX box = tblob->bounding_box();
  2226. float x_center = (box.left() + box.right()) / 2.0f;
  2227. float baseline = row->base_line(x_center);
  2228. float scale = kBlnXHeight / row->x_height();
  2229. tblob->Normalize(nullptr, nullptr, nullptr, x_center, baseline, scale, scale,
  2230. 0.0f, static_cast<float>(kBlnBaselineOffset), false, nullptr);
  2231. }
  2232. /**
  2233. * Return a TBLOB * from the whole pix.
  2234. * To be freed later with delete.
  2235. */
  2236. static TBLOB *make_tesseract_blob(float baseline, float xheight,
  2237. float descender, float ascender,
  2238. bool numeric_mode, Pix* pix) {
  2239. TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix);
  2240. // Normalize TBLOB
  2241. ROW *row =
  2242. TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
  2243. TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode);
  2244. delete row;
  2245. return tblob;
  2246. }
  2247. /**
  2248. * Adapt to recognize the current image as the given character.
  2249. * The image must be preloaded into pix_binary_ and be just an image
  2250. * of a single character.
  2251. */
  2252. void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
  2253. int length,
  2254. float baseline,
  2255. float xheight,
  2256. float descender,
  2257. float ascender) {
  2258. UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
  2259. TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender,
  2260. tesseract_->classify_bln_numeric_mode,
  2261. tesseract_->pix_binary());
  2262. float threshold;
  2263. float best_rating = -100;
  2264. // Classify to get a raw choice.
  2265. BLOB_CHOICE_LIST choices;
  2266. tesseract_->AdaptiveClassifier(blob, &choices);
  2267. BLOB_CHOICE_IT choice_it;
  2268. choice_it.set_to_list(&choices);
  2269. for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
  2270. choice_it.forward()) {
  2271. if (choice_it.data()->rating() > best_rating) {
  2272. best_rating = choice_it.data()->rating();
  2273. }
  2274. }
  2275. threshold = tesseract_->matcher_good_threshold;
  2276. if (blob->outlines)
  2277. tesseract_->AdaptToChar(blob, id, kUnknownFontinfoId, threshold,
  2278. tesseract_->AdaptedTemplates);
  2279. delete blob;
  2280. }
  2281. PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
  2282. auto *page_res = new PAGE_RES(false, block_list,
  2283. &(tesseract_->prev_word_best_choice_));
  2284. tesseract_->recog_all_words(page_res, nullptr, nullptr, nullptr, 1);
  2285. return page_res;
  2286. }
  2287. PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
  2288. PAGE_RES* pass1_result) {
  2289. if (!pass1_result)
  2290. pass1_result = new PAGE_RES(false, block_list,
  2291. &(tesseract_->prev_word_best_choice_));
  2292. tesseract_->recog_all_words(pass1_result, nullptr, nullptr, nullptr, 2);
  2293. return pass1_result;
  2294. }
  2295. struct TESS_CHAR : ELIST_LINK {
  2296. char *unicode_repr;
  2297. int length; // of unicode_repr
  2298. float cost;
  2299. TBOX box;
  2300. TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
  2301. length = (len == -1 ? strlen(repr) : len);
  2302. unicode_repr = new char[length + 1];
  2303. strncpy(unicode_repr, repr, length);
  2304. }
  2305. TESS_CHAR()
  2306. : unicode_repr(nullptr),
  2307. length(0),
  2308. cost(0.0f)
  2309. { // Satisfies ELISTIZE.
  2310. }
  2311. ~TESS_CHAR() {
  2312. delete [] unicode_repr;
  2313. }
  2314. };
  2315. ELISTIZEH(TESS_CHAR)
  2316. ELISTIZE(TESS_CHAR)
  2317. static void add_space(TESS_CHAR_IT* it) {
  2318. auto *t = new TESS_CHAR(0, " ");
  2319. it->add_after_then_move(t);
  2320. }
  /**
   * Converts a rating to a cost by adding 100 and clamping negative results
   * to 0. (Ratings worse than -100 have not been observed, but the clamp
   * does no harm.)
   */
  static float rating_to_cost(float rating) {
    const float shifted = 100 + rating;
    return shifted < 0 ? 0 : shifted;
  }
  2329. /**
  2330. * Extract the OCR results, costs (penalty points for uncertainty),
  2331. * and the bounding boxes of the characters.
  2332. */
  2333. static void extract_result(TESS_CHAR_IT* out,
  2334. PAGE_RES* page_res) {
  2335. PAGE_RES_IT page_res_it(page_res);
  2336. int word_count = 0;
  2337. while (page_res_it.word() != nullptr) {
  2338. WERD_RES *word = page_res_it.word();
  2339. const char *str = word->best_choice->unichar_string().c_str();
  2340. const char *len = word->best_choice->unichar_lengths().c_str();
  2341. TBOX real_rect = word->word->bounding_box();
  2342. if (word_count)
  2343. add_space(out);
  2344. int n = strlen(len);
  2345. for (int i = 0; i < n; i++) {
  2346. auto *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
  2347. str, *len);
  2348. tc->box = real_rect.intersection(word->box_word->BlobBox(i));
  2349. out->add_after_then_move(tc);
  2350. str += *len;
  2351. len++;
  2352. }
  2353. page_res_it.forward();
  2354. word_count++;
  2355. }
  2356. }
  2357. /**
  2358. * Extract the OCR results, costs (penalty points for uncertainty),
  2359. * and the bounding boxes of the characters.
  2360. */
  2361. int TessBaseAPI::TesseractExtractResult(char** text,
  2362. int** lengths,
  2363. float** costs,
  2364. int** x0,
  2365. int** y0,
  2366. int** x1,
  2367. int** y1,
  2368. PAGE_RES* page_res) {
  2369. TESS_CHAR_LIST tess_chars;
  2370. TESS_CHAR_IT tess_chars_it(&tess_chars);
  2371. extract_result(&tess_chars_it, page_res);
  2372. tess_chars_it.move_to_first();
  2373. int n = tess_chars.length();
  2374. int text_len = 0;
  2375. *lengths = new int[n];
  2376. *costs = new float[n];
  2377. *x0 = new int[n];
  2378. *y0 = new int[n];
  2379. *x1 = new int[n];
  2380. *y1 = new int[n];
  2381. int i = 0;
  2382. for (tess_chars_it.mark_cycle_pt();
  2383. !tess_chars_it.cycled_list();
  2384. tess_chars_it.forward(), i++) {
  2385. TESS_CHAR *tc = tess_chars_it.data();
  2386. text_len += (*lengths)[i] = tc->length;
  2387. (*costs)[i] = tc->cost;
  2388. (*x0)[i] = tc->box.left();
  2389. (*y0)[i] = tc->box.bottom();
  2390. (*x1)[i] = tc->box.right();
  2391. (*y1)[i] = tc->box.top();
  2392. }
  2393. char *p = *text = new char[text_len];
  2394. tess_chars_it.move_to_first();
  2395. for (tess_chars_it.mark_cycle_pt();
  2396. !tess_chars_it.cycled_list();
  2397. tess_chars_it.forward()) {
  2398. TESS_CHAR *tc = tess_chars_it.data();
  2399. strncpy(p, tc->unicode_repr, tc->length);
  2400. p += tc->length;
  2401. }
  2402. return n;
  2403. }
  2404. /** This method returns the features associated with the input blob. */
  2405. // The resulting features are returned in int_features, which must be
  2406. // of size MAX_NUM_INT_FEATURES. The number of features is returned in
  2407. // num_features (or 0 if there was a failure).
  2408. // On return feature_outline_index is filled with an index of the outline
  2409. // corresponding to each feature in int_features.
  2410. // TODO(rays) Fix the caller to out outline_counts instead.
  2411. void TessBaseAPI::GetFeaturesForBlob(TBLOB* blob,
  2412. INT_FEATURE_STRUCT* int_features,
  2413. int* num_features,
  2414. int* feature_outline_index) {
  2415. GenericVector<int> outline_counts;
  2416. GenericVector<INT_FEATURE_STRUCT> bl_features;
  2417. GenericVector<INT_FEATURE_STRUCT> cn_features;
  2418. INT_FX_RESULT_STRUCT fx_info;
  2419. tesseract_->ExtractFeatures(*blob, false, &bl_features,
  2420. &cn_features, &fx_info, &outline_counts);
  2421. if (cn_features.empty() || cn_features.size() > MAX_NUM_INT_FEATURES) {
  2422. *num_features = 0;
  2423. return; // Feature extraction failed.
  2424. }
  2425. *num_features = cn_features.size();
  2426. memcpy(int_features, &cn_features[0], *num_features * sizeof(cn_features[0]));
  2427. // TODO(rays) Pass outline_counts back and simplify the calling code.
  2428. if (feature_outline_index != nullptr) {
  2429. int f = 0;
  2430. for (int i = 0; i < outline_counts.size(); ++i) {
  2431. while (f < outline_counts[i])
  2432. feature_outline_index[f++] = i;
  2433. }
  2434. }
  2435. }
  2436. // This method returns the row to which a box of specified dimensions would
  2437. // belong. If no good match is found, it returns nullptr.
  2438. ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks,
  2439. int left, int top, int right, int bottom) {
  2440. TBOX box(left, bottom, right, top);
  2441. BLOCK_IT b_it(blocks);
  2442. for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
  2443. BLOCK* block = b_it.data();
  2444. if (!box.major_overlap(block->pdblk.bounding_box()))
  2445. continue;
  2446. ROW_IT r_it(block->row_list());
  2447. for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
  2448. ROW* row = r_it.data();
  2449. if (!box.major_overlap(row->bounding_box()))
  2450. continue;
  2451. WERD_IT w_it(row->word_list());
  2452. for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
  2453. WERD* word = w_it.data();
  2454. if (box.major_overlap(word->bounding_box()))
  2455. return row;
  2456. }
  2457. }
  2458. }
  2459. return nullptr;
  2460. }
  2461. /** Method to run adaptive classifier on a blob. */
  2462. void TessBaseAPI::RunAdaptiveClassifier(TBLOB* blob,
  2463. int num_max_matches,
  2464. int* unichar_ids,
  2465. float* ratings,
  2466. int* num_matches_returned) {
  2467. auto* choices = new BLOB_CHOICE_LIST;
  2468. tesseract_->AdaptiveClassifier(blob, choices);
  2469. BLOB_CHOICE_IT choices_it(choices);
  2470. int& index = *num_matches_returned;
  2471. index = 0;
  2472. for (choices_it.mark_cycle_pt();
  2473. !choices_it.cycled_list() && index < num_max_matches;
  2474. choices_it.forward()) {
  2475. BLOB_CHOICE* choice = choices_it.data();
  2476. unichar_ids[index] = choice->unichar_id();
  2477. ratings[index] = choice->rating();
  2478. ++index;
  2479. }
  2480. *num_matches_returned = index;
  2481. delete choices;
  2482. }
  2483. #endif // ndef DISABLED_LEGACY_ENGINE
  2484. } // namespace tesseract.
Tip!

Press p to see the previous file, or n to see the next file.