All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TextExtractor.h
Go to the documentation of this file.
1 //---------------------------------------------------------------------------------------
2 // Copyright (c) 2001-2023 by Apryse Software Inc. All Rights Reserved.
3 // Consult legal.txt regarding legal and license information.
4 //---------------------------------------------------------------------------------------
5 #ifndef PDFTRON_H_CPPPDFTextExtractor
6 #define PDFTRON_H_CPPPDFTextExtractor
7 
8 #include <PDF/Page.h>
9 #include <PDF/Rect.h>
10 #include <Common/UString.h>
11 #include <C/PDF/TRN_TextExtractor.h>
12 #include <vector>
13 #include <PDF/Highlights.h>
14 #include<PDF/OCG/Context.h>
15 
16 namespace pdftron {
17  namespace PDF {
18 
19 class Style;
20 class Word;
21 class Line;
22 
// A character range expressed as an index plus a length.
// TextExtractor::GetHighlights() accepts a collection of these.
26 struct CharRange
27 {
28  int index; // character index (presumably where the range starts -- confirm)
29  int length; // character length (number of characters in the range)
30 };
31 
117 {
 // NOTE(review): the line that opens this class declaration is not present in
 // this listing; the constructor and destructor below name it TextExtractor.
118 public:
122 
 // Default constructor and destructor.
126  TextExtractor();
127  ~TextExtractor();
128 
 // Text processing options.
 // NOTE(review): the enum header and the enumerator lines are missing from
 // this listing; only the per-option descriptions remain. Presumably these
 // are the values accepted by the 'flags' argument of Begin() -- confirm
 // against the original header.
134  {
135  // Disables expanding of ligatures using a predefined mapping.
136  // Default ligatures are: fi, ff, fl, ffi, ffl, ch, cl, ct, ll,
137  // ss, fs, st, oe, OE.
139 
140  // Disables removing duplicated text that is frequently used to
141  // achieve visual effects of drop shadow and fake bold.
143 
144  // Treat punctuation (e.g. full stop, comma, semicolon, etc.) as
145  // word break characters.
147 
148  // Enables removal of text that is obscured by images or
149  // rectangles. Because this option carries a small performance
150  // penalty for text extraction, it is not enabled by
151  // default.
153 
154  // Enables removing text that uses rendering mode 3 (i.e. invisible text).
155  // Invisible text is usually used in 'PDF Searchable Images' (i.e. scanned
156  // pages with a corresponding OCR text). As a result, invisible text
157  // will be extracted by default.
159 
160  // Enables removal of text that is marked as part of a Watermark layer.
162 
163  // Use Z-order as reading order for text.
165  };
166 
 // Begins processing of the given page (implementation not visible here).
 //   page     - the page to process.
 //   clip_ptr - optional rectangle, defaults to a null pointer; presumably
 //              limits extraction to that region -- confirm.
 //   flags    - defaults to 0; presumably a combination of the processing
 //              options described above -- confirm.
176  void Begin(Page page, const Rect* clip_ptr = 0, UInt32 flags = 0);
177 
 // Supplies an optional-content (OCG) context pointer to the extractor.
 // NOTE(review): ownership/lifetime of 'ctx' is not shown here -- confirm.
188  void SetOCGContext(OCG::Context* ctx);
189 
 // Returns a word count as an int.
193  int GetWordCount();
194 
 // Setter/getter pair for the right-to-left language setting.
200  void SetRightToLeftLanguage(bool rtl);
205  bool GetRightToLeftLanguage();
 // Returns the extracted text as a UString.
 //   dehyphen - defaults to true; presumably joins words hyphenated across
 //              line breaks -- confirm.
219  UString GetAsText(bool dehyphen = true);
220 
 // Out-parameter overload of GetAsText(); excluded when SWIG is defined.
221 #ifndef SWIG
222  void GetAsText(UString& out_str, bool dehyphen = true);
223 #endif
224 
 // Returns, as a UString, text associated with the given annotation
 // (presumably the text located under it, per the method name -- confirm).
230  UString GetTextUnderAnnot(const Annot& annot);
231 
 // Out-parameter overload of GetTextUnderAnnot(); excluded when SWIG is defined.
232 #ifndef SWIG
233  void GetTextUnderAnnot(UString& out_str, const Annot& annot);
234 #endif
235 
236 
 // XML output options.
 // NOTE(review): the enum header and the enumerator lines are missing from
 // this listing; presumably these are the values accepted by the
 // 'xml_output_flags' argument of GetAsXML() -- confirm.
241  {
242  // Output words as XML elements instead of inline text.
244 
245  // Include bounding box information for each XML element.
246  // The bounding box information will be stored as 'bbox' attribute.
248 
249  // Include font and styling information.
251  };
252 
 // Returns the extracted text as XML in a UString.
 //   xml_output_flags - defaults to 0 (no options set).
295  UString GetAsXML(UInt32 xml_output_flags = 0);
296 
 // Out-parameter overload of GetAsXML(); excluded when SWIG is defined.
297 #ifndef SWIG
298  void GetAsXML(UString& out_xml, UInt32 xml_output_flags = 0);
299 #endif
300 
 // Builds a Highlights object from a list of character ranges.
306  Highlights GetHighlights(const std::vector<CharRange>& char_ranges);
307 
308 #ifndef SWIG
309 
 // Pointer-plus-count overload of GetHighlights(); excluded when SWIG is
 // defined. 'char_ranges_count' is the number of entries in 'char_ranges'.
315  Highlights GetHighlights(const CharRange* char_ranges, size_t char_ranges_count);
316 #endif
317 
 // Returns a line count as an int.
321  int GetNumLines();
322 
323 
 // Returns the first Line; Line::GetNextLine() and Line::IsValid() are
 // declared further down and presumably drive iteration -- confirm.
330  Line GetFirstLine();
331 
 // NOTE(review): presumably releases the resources behind mp_extractor;
 // the implementation is not visible in this header -- confirm.
335  void Destroy();
336 
337  // @cond PRIVATE_DOC
338 private:
 // Handle of the C API type TRN_TextExtractor backing this object.
339  TRN_TextExtractor mp_extractor;
340 
341  // TextExtractor should not be copied: the copy constructor and copy
 // assignment operator are declared private to prevent it.
342  TextExtractor(const TextExtractor& other);
343  TextExtractor& operator= (const TextExtractor&);
344  // @endcond
345 };
346 
// Font and styling information. Instances are returned by
// Word::GetStyle(), Word::GetCharStyle() and Line::GetStyle().
352 class Style
353 {
354 public:
355 
 // Returns the font as an SDF object.
362  SDF::Obj GetFont();
363 
368 
 // Returns the font size as a double.
377  double GetFontSize();
378 
 // Returns the font weight as an int (the numeric scale is not shown here).
387  int GetWeight();
388 
 // Returns whether the style is italic.
393  bool IsItalic();
394 
 // Returns whether the style uses a serif font.
400  bool IsSerif();
401 
 // Returns the color as a vector of ints.
 // NOTE(review): presumably three RGB components, matching the overload
 // below -- confirm.
405  std::vector<int> GetColor();
406 
 // Overload taking a caller-provided 3-element UInt8 array named 'rgb';
 // excluded when SWIG is defined.
407 #ifndef SWIG
408  void GetColor(UInt8 rgb[3]);
409 #endif
410 
 // Equality / inequality comparison against another Style.
411  bool operator== (const Style& s) const;
412  bool operator!= (const Style& s) const;
413 
 // Default constructor.
414  Style();
415 
416  // @cond PRIVATE_DOC
417  #ifndef SWIGHIDDEN
 // Copy constructor, constructor from the C API style type, and the C API
 // value itself. These are declared in the public section but are hidden
 // when SWIGHIDDEN is defined.
418  Style(const Style& s);
419  Style(TRN_TextExtractorStyle impl);
420  TRN_TextExtractorStyle mp_style;
421  #endif
422  // @endcond
423 };
424 
// A word produced by text extraction. Instances are returned by
// Line::GetFirstWord(), Line::GetWord() and Word::GetNextWord().
430 class Word
431 {
432 public:
 // Returns the number of glyphs as an int.
436  int GetNumGlyphs();
437 
 // Returns the bounding box as a Rect.
444  Rect GetBBox();
445 
 // Overload taking a caller-provided 4-element double array;
 // excluded when SWIG is defined.
446 #ifndef SWIG
447  void GetBBox(double out_bbox[4]);
448 #endif
449 
 // Returns the quad as a vector of doubles.
 // NOTE(review): presumably 8 values, matching the overload below -- confirm.
454  std::vector<double> GetQuad();
455 
 // Overload taking a caller-provided 8-element double array;
 // excluded when SWIG is defined.
456 #ifndef SWIG
457  void GetQuad(double out_quad[8]);
458 #endif
459 
 // Returns the quad of the glyph selected by 'glyph_idx'.
 // NOTE(review): valid index range presumably [0, GetNumGlyphs()) -- confirm.
465  std::vector<double> GetGlyphQuad(int glyph_idx);
466 
 // Overload taking a caller-provided 8-element double array;
 // excluded when SWIG is defined.
467 #ifndef SWIG
468  void GetGlyphQuad(int glyph_idx, double out_quad[8]);
469 #endif
470 
 // Returns the Style of the character selected by 'char_idx'.
475  Style GetCharStyle(int char_idx);
476 
 // Returns the Style of the word.
480  Style GetStyle();
481 
 // Returns the string length as an int.
485  int GetStringLen();
486 
 // Returns the word's string: a UString when SWIG is defined, otherwise a
 // pointer to const Unicode data.
 // NOTE(review): for the pointer form the length presumably comes from
 // GetStringLen(), and the buffer lifetime is not shown here -- confirm.
490 #ifdef SWIG
491  UString GetString();
492 #else
493  const Unicode* GetString();
494 #endif
495 
 // Returns the next Word; pair with IsValid() to detect the end
 // (presumed iteration pattern -- confirm).
499  Word GetNextWord();
500 
 // Returns the current number of this word as an int
 // (exact meaning of the index is not shown here).
506  int GetCurrentNum();
507 
 // Returns whether this Word is valid.
511  bool IsValid();
512 
 // Equality / inequality comparison against another Word.
513  bool operator== (const Word&) const;
514  bool operator!= (const Word&) const;
 // Default constructor.
515  Word();
516 
517  // @cond PRIVATE_DOC
518  #ifndef SWIGHIDDEN
 // Constructor from the C API word type and the C API value itself. These
 // are declared in the public section but are hidden when SWIGHIDDEN is
 // defined.
519  Word(TRN_TextExtractorWord impl);
520  TRN_TextExtractorWord mp_word;
521  #endif
522  // @endcond
523 };
524 
// A line of text produced by text extraction. Instances are returned by
// TextExtractor::GetFirstLine() and Line::GetNextLine().
530 class Line {
531 public:
532 
 // Returns the number of words as an int.
536  int GetNumWords();
537 
 // Returns whether this is a "simple" line
 // (the criteria are not shown in this header).
542  bool IsSimpleLine();
543 
 // Returns the bounding box: a Rect when SWIG is defined, otherwise a
 // pointer to const doubles.
 // NOTE(review): the pointer form presumably addresses 4 values whose
 // lifetime is tied to the extractor -- confirm.
550 #ifdef SWIG
551  Rect GetBBox();
552 #else
553  const double* GetBBox();
554 #endif
555 
 // Returns the quad as a vector of doubles.
 // NOTE(review): presumably 8 values, matching the overload below -- confirm.
560  std::vector<double> GetQuad();
561 
562 #ifndef SWIG
563 
 // Overload taking a caller-provided 8-element double array;
 // excluded when SWIG is defined.
567  void GetQuad(double out_quad[8]);
568 #endif
569 
 // Returns the first Word of the line.
574  Word GetFirstWord();
575 
 // Returns the Word selected by 'word_idx'.
 // NOTE(review): valid index range presumably [0, GetNumWords()) -- confirm.
580  Word GetWord(int word_idx);
581 
 // Returns the next Line; pair with IsValid() to detect the end
 // (presumed iteration pattern -- confirm).
585  Line GetNextLine();
586 
 // Returns the current number of this line as an int
 // (exact meaning of the index is not shown here).
590  int GetCurrentNum();
591 
 // Returns the Style of the line.
595  Style GetStyle();
596 
 // Returns the paragraph identifier as an int.
602  int GetParagraphID();
603 
 // Returns the flow identifier as an int.
609  int GetFlowID();
610 
 // Returns whether the line ends with a hyphen.
615  bool EndsWithHyphen();
616 
 // Returns whether this Line is valid.
620  bool IsValid();
621 
 // Equality / inequality comparison against another Line.
622  bool operator== (const Line&) const;
623  bool operator!= (const Line&) const;
 // Default constructor.
624  Line();
625 
626  // @cond PRIVATE_DOC
627  #ifndef SWIGHIDDEN
 // Constructor from the C API line type and the C API value itself. These
 // are declared in the public section but are hidden when SWIGHIDDEN is
 // defined.
628  Line(TRN_TextExtractorLine impl);
629  TRN_TextExtractorLine mp_line;
630  #endif
631  // @endcond
632 };
633 
634 
635 
636 
637 #include <Impl/TextExtractor.inl>
638 
639  }; // namespace PDF
640 }; // namespace pdftron
641 
642 #endif // PDFTRON_H_CPPPDFTextExtractor
bool operator!=(const Line &) const
bool operator==(const Line &) const
UString GetAsText(bool dehyphen=true)
Highlights GetHighlights(const std::vector< CharRange > &char_ranges)
std::vector< double > GetGlyphQuad(int glyph_idx)
Style GetCharStyle(int char_idx)
const double * GetBBox()
TRN_UInt8 UInt8
Definition: BasicTypes.h:15
void SetOCGContext(OCG::Context *ctx)
pdftron::PDF::Line Line
std::vector< double > GetQuad()
std::vector< double > GetQuad()
pdftron::PDF::Style Style
TRN_Unicode Unicode
Definition: BasicTypes.h:22
TRN_UInt32 UInt32
Definition: BasicTypes.h:13
UString GetTextUnderAnnot(const Annot &annot)
bool operator==(const Style &s) const
bool operator!=(const Word &) const
const Unicode * GetString()
void Begin(Page page, const Rect *clip_ptr=0, UInt32 flags=0)
UString GetAsXML(UInt32 xml_output_flags=0)
std::vector< int > GetColor()
bool operator!=(const Style &s) const
pdftron::PDF::Word Word
void SetRightToLeftLanguage(bool rtl)
Word GetWord(int word_idx)
bool operator==(const Word &) const