Is it possible to convert Untagged PDF to Tagged PDF?

751 views
Skip to first unread message

Support

unread,
Sep 13, 2011, 7:39:43 PM9/13/11
to pdfne...@googlegroups.com
Q: Is it possible to convert an untagged PDF to a tagged PDF with the PDFTron PDFNet SDK?
 
---------
 
A:  You can use PDFNet SDK to implement PDF tagging; however, there is no single function that automatically generates a tagged PDF.

 

As a starting point, you may want to take a look at the LogicalStructure sample project:

   http://www.pdftron.com/pdfnet/samplecode.html#LogicalStructure

 

Attached is another example of how to tag a PDF. This example is a bit more involved and is written in C++.  If you need to tag PDFs automatically (as opposed to manually), you could use ‘pdftron.PDF.TextExtractor’ and ‘pdftron.PDF.ElementReader’ as a starting point for the analysis.

 

// PDFTagTest.cpp  ...

 

#include <PDF/PDFNet.h>
#include <PDF/PDFDoc.h>
#include <PDF/ElementReader.h>
#include <PDF/Struct/SElement.h>
#include <PDF/Struct/ContentItem.h>
#include <SDF/NumberTree.h>

#include <cassert>
#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using namespace pdftron;
using namespace PDF;
using namespace SDF;
using namespace Struct;
using namespace std;

 

/**
 * Builds a new structure element of the requested structure type.
 *
 * A fresh indirect dictionary is created in @a doc, @a struct_type is stored
 * under its "S" key, and the dictionary is wrapped in an SElement. The
 * returned element has no parent link; this function does not attach it to
 * anything.
 *
 * @param doc         Document in which the dictionary is created.
 * @param struct_type Name written under the "S" (structure type) key.
 * @return An SElement wrapping the newly created dictionary.
 */
SElement SElement_Create(PDFDoc& doc, const char* struct_type)
{
 Obj elem_dict = doc.CreateIndirectDict();

 // "S" holds the structure type of the element.
 elem_dict.PutName("S", struct_type);

 return SElement(elem_dict);
}

/**
 * Adds @a kid to the "K" (kids) array of @a element and points the kid's
 * "P" (parent) entry back at @a element.
 *
 * If @a element has no "K" entry, or its "K" entry is not an array, a new
 * empty array is stored under "K" first (replacing whatever was there).
 *
 * @param element       Parent structure element; must be valid.
 * @param kid           Structure element to attach; must be valid.
 * @param insert_before Index handed to the kids array's Insert call. When the
 *                      array is empty the index is forced to 0.
 *                      NOTE(review): presumably the kid lands in front of the
 *                      entry currently at that index - confirm against the
 *                      Obj::Insert documentation.
 */
void SElement_Insert(SElement& element, SElement& kid, int insert_before)
{
 assert(element.IsValid() && kid.IsValid());

 Obj parent_dict = element.GetSDFObj();
 Obj child_dict = kid.GetSDFObj();

 // Make sure the parent owns a kids array we can insert into.
 Obj kids = parent_dict.FindObj("K");
 if (!kids || !kids.IsArray()) {
  kids = parent_dict.PutArray("K");
 }
 assert(kids.IsArray());

 // An empty array only has one legal insertion point.
 const int position = kids.Size() ? insert_before : 0;
 kids.Insert(position, child_dict);

 // Back-link from the kid to its parent.
 child_dict.Put("P", parent_dict);
}

/**
 * Registers a new marked-content reference for @a element on @a page and
 * returns the number to use as its MCID.
 *
 * Steps performed, in order:
 *  - stores @a page under the element's "Pg" key;
 *  - makes sure the element has a "K" array (creating/replacing it if the
 *    entry is missing or not an array);
 *  - reads the page's "StructParents" number, or, if the page has none,
 *    takes the structure tree root's "ParentTreeNextKey" (created as 0 when
 *    absent), writes it to the page and increments the counter;
 *  - makes sure the root has a "ParentTree" dictionary (a new one gets an
 *    empty "Nums" array);
 *  - looks up the parent-tree entry for that index, creating an indirect
 *    array for it when there is none or it is not an array;
 *  - appends the element's dictionary to that array; the index at which it
 *    was appended is the returned id;
 *  - inserts that id as a number into the element's "K" array.
 *
 * The document must already have a valid structure tree; this is only
 * checked with assert.
 *
 * @param doc           Document that owns the structure tree.
 * @param element       Structure element that will reference the content;
 *                      must be valid.
 * @param page          Page dictionary the content lives on.
 * @param insert_before Index in the element's "K" array at which the id is
 *                      inserted. A negative value (the default) selects the
 *                      current array size; the index is forced to 0 when the
 *                      array is empty.
 * @return The id that was inserted into "K" (size of the page's parent-tree
 *         array before the element was appended).
 */
int ContentItem_Create(PDFDoc& doc, SElement& element, Obj page, int insert_before = -1)
{
 assert(element.IsValid());

 Obj elem_dict = element.GetSDFObj();
 elem_dict.Put("Pg", page);

 // Kids array of the element.
 Obj kids = elem_dict.FindObj("K");
 if (!kids || !kids.IsArray()) {
  kids = elem_dict.PutArray("K");
 }
 assert(kids.IsArray());

 // Resolve the insertion index: empty array -> 0, negative -> end.
 if (!kids.Size()) {
  insert_before = 0;
 }
 else if (insert_before < 0) {
  insert_before = kids.Size();
 }

 STree tree = doc.GetStructTree();
 assert(tree.IsValid());
 Obj root_dict = tree.GetSDFObj();

 // Parent-tree key for this page: reuse the page's own, or allocate the
 // next free one from the root's counter.
 int page_key = -1;
 Obj page_key_obj = page.FindObj("StructParents");
 if (page_key_obj) {
  page_key = int(page_key_obj.GetNumber());
 }
 else {
  Obj next_key = root_dict.FindObj("ParentTreeNextKey");
  if (!next_key || !next_key.IsNumber()) {
   next_key = root_dict.PutNumber("ParentTreeNextKey", 0);
  }
  page_key = (int) next_key.GetNumber();
  page.PutNumber("StructParents", page_key);
  next_key.SetNumber(page_key + 1);
 }

 // Parent tree (a number tree) of the structure tree root.
 Obj parent_tree = root_dict.FindObj("ParentTree");
 if (!parent_tree || !parent_tree.IsDict()) {
  parent_tree = root_dict.PutDict("ParentTree");
  parent_tree.PutArray("Nums");
 }

 NumberTree number_tree(parent_tree);
 assert(number_tree.IsValid());
 assert(page_key >= 0);

 // Array of structure elements registered for this page so far.
 Obj page_entries = 0;
 DictIterator found = number_tree.GetIterator(page_key);
 if (found.HasNext()) {
  page_entries = found.Value();
 }
 if (!page_entries || !page_entries.IsArray()) {
  page_entries = doc.CreateIndirectArray();
  number_tree.Put(page_key, page_entries);
 }

 // The slot the element occupies in the page's array doubles as its id.
 const int mcid = (int) page_entries.Size();
 page_entries.PushBack(elem_dict);

 kids.InsertNumber(insert_before, mcid);
 return mcid;
}

/**
 * Attaches @a kid as a direct child of the structure tree root.
 *
 * The kid's dictionary is inserted into the root's "K" array and its "P"
 * (parent) entry is set to the root. A missing "K" entry is created as an
 * empty array; an existing "K" array is reused so that the function can be
 * called more than once on the same tree.
 *
 * Limitation: if the root's "K" entry exists but is not an array (it may be
 * a single dictionary), nothing is inserted and @a kid is left untouched.
 *
 * @param tree_root     Structure tree root; must be valid.
 * @param kid           Element to attach; must be valid. On success it is
 *                      re-assigned to the element the tree returns for the
 *                      insertion index.
 * @param insert_before Index in the root's "K" array at which the kid is
 *                      inserted. Negative or past-the-end values append.
 */
void STree_Insert(STree& tree_root, SElement& kid, int insert_before)
{
 assert(tree_root.IsValid() && kid.IsValid());
 Obj st = tree_root.GetSDFObj();
 Obj k = st.FindObj("K");

 if (!k)
 {
  k = st.PutArray("K");
 }
 else if (!k.IsArray())  // Note: k can be a dict.
 {
  //TODO: a root whose K is a single (non-array) kid is not supported yet;
  //such a StructTreeRoot has to be deleted before calling this function.
  return;
 }

 assert(k.IsArray());

 // Clamp the requested index into [0, size] so that the Insert below and
 // the GetKid re-fetch always refer to the same, existing slot.
 const size_t kid_count = k.Size();
 size_t pos = kid_count;
 if (insert_before >= 0 && static_cast<size_t>(insert_before) < kid_count)
 {
  pos = static_cast<size_t>(insert_before);
 }

 Obj kid_obj = kid.GetSDFObj();
 k.Insert(pos, kid_obj);

 kid_obj.Put("P", st);  // Parent

 kid = tree_root.GetKid(static_cast<int>(pos));
}

/**
 * Returns the document's structure tree, creating an empty one if needed.
 *
 * When the document already reports a valid structure tree it is returned
 * unchanged. Otherwise a new indirect dictionary with "Type" set to
 * "StructTreeRoot" and "ParentTreeNextKey" set to 0 is stored under the
 * "StructTreeRoot" key of the document root and returned wrapped in an STree.
 *
 * @param doc Document to look up / extend.
 * @return The existing or newly created structure tree.
 */
STree STree_Create(PDFDoc& doc)
{
 STree existing = doc.GetStructTree();
 if (!existing.IsValid()) {
  // No usable tree yet: build a minimal root and hook it into the catalog.
  Obj root_dict = doc.CreateIndirectDict();
  root_dict.PutName("Type", "StructTreeRoot");
  root_dict.PutNumber("ParentTreeNextKey", 0);

  doc.GetRoot().Put("StructTreeRoot", root_dict);
  return STree(root_dict);
 }

 return existing;
}

int main(int argc, char *argv[])
{
 int ret = 0;
 PDFNet::Initialize();

 // Relative path to the folder containing test files.
 string input_path =  "../../TestFiles/";
 string output_path = "../../TestFiles/Output/";

 try // Extract logical structure from a PDF document
 {
  //--preparation
  PDFDoc doc((input_path + "my.pdf").c_str());
  doc.InitSecurityHandler();

  //remove the existing structure tree
  doc.GetRoot().Erase("StructTreeRoot");

  Page pg = doc.GetPage(1);
  TRN_DispList dl = 0;
  REX(TRN_DispListCreate(pg.mp_page, &dl));

  //remove the existing structure tree
  pg.GetSDFObj().Erase("StructTreeRoot");

  STree sroot = STree_Create(doc);
  bool marked = false;
  TRN_Bool result;     //tell if there are any graphical elements marked
  TRN_Bool intersection_mode = 0;  //tag using intersecting (0) or containing(1)
  vector<double> rects;
  bool reshuffle = true;

  //--mark content 1
  SElement p1 = SElement_Create(doc, "P");
  STree_Insert(sroot, p1, 0);
  int mcid1 = ContentItem_Create(doc, p1, pg.GetSDFObj());
  Obj prop1 = doc.CreateIndirectDict();
  prop1.PutNumber("MCID", mcid1);
  rects.clear();
  rects.push_back(182);  rects.push_back(453);
  rects.push_back(227); rects.push_back(464);
  REX(TRN_DispListTag(dl, &rects[0], rects.size()/4, "P", prop1.mp_obj, intersection_mode, reshuffle, &result));
  marked |= TBToB(result);

  //--output file
  if ( marked )
  {
   REX(TRN_DispListSave(dl, pg.mp_page));

   doc.GetRoot().PutDict("MarkInfo").PutBool("Marked", true);
   doc.Save("../../TestFiles/tag_out.pdf", pdftron::SDF::SDFDoc::e_linearized, 0);
  }

  REX(TRN_DispListDestroy(dl));

  cout << "\nDone." << endl;
 }
 catch(Common::Exception& e)
 {
  cout << e << endl;
  ret = 1;
 }
 catch(...)
 {
  cout << "Unknown Exception" << endl;
  ret = 1;
 }

 PDFNet::Terminate();
 return ret;
}

Reply all
Reply to author
Forward
0 new messages