As a starting point, you may want to take a look at the LogicalStructure sample project:
http://www.pdftron.com/pdfnet/samplecode.html#LogicalStructure
Attached is another example of how to tag a PDF. This example is a bit more involved and is written in C++. If you need to tag PDFs automatically (as opposed to tagging them manually), you could use ‘pdftron.PDF.TextExtractor’ and ‘pdftron.PDF.ElementReader’ as a starting point for the analysis.
// PDFTagTest.cpp ...
#include <PDF/PDFNet.h>
#include <PDF/PDFDoc.h>
#include <PDF/ElementReader.h>
#include <PDF/Struct/SElement.h>
#include <PDF/Struct/ContentItem.h>
#include <SDF/NumberTree.h>
#include <cassert>
#include <iostream>
#include <map>
#include <string>
#include <vector>
using namespace pdftron;
using namespace PDF;
using namespace SDF;
using namespace Struct;
using namespace std;
/**
 * Builds a stand-alone structure element of the requested type.
 *
 * A new indirect dictionary is created in the document, its "S" entry is
 * set to the given name, and the dictionary is returned wrapped in an
 * SElement. This function does not set a parent ("P") or any kids ("K");
 * linking the element into a tree is left to the caller.
 *
 * @param doc          Document in which the indirect dictionary is created.
 * @param struct_type  Name stored under the "S" key (e.g. "P").
 * @return An SElement wrapping the newly created dictionary.
 */
SElement SElement_Create(PDFDoc& doc, const char* struct_type)
{
    Obj elem_dict = doc.CreateIndirectDict();
    elem_dict.PutName("S", struct_type);   // "S" carries the structure type
    SElement created(elem_dict);
    return created;
}
/**
 * Attaches the kid element to the given parent element.
 *
 * The kid's dictionary is inserted into the parent's "K" array and the
 * kid's "P" entry is set to the parent's dictionary.
 *
 * If the parent has no "K" entry, or its "K" entry is not an array, a new
 * array is stored under "K" first.
 * NOTE(review): this presumably replaces (and so discards) an existing
 * single, non-array kid - confirm before using on pre-tagged documents.
 *
 * @param element        Parent structure element; must be valid.
 * @param kid            Element to attach; must be valid.
 * @param insert_before  Index passed to Obj::Insert on the "K" array. It is
 *                       forced to 0 when the array is empty. (Per the
 *                       parameter name the kid lands before that position;
 *                       confirm against the Obj::Insert documentation.)
 */
void SElement_Insert(SElement& element, SElement& kid, int insert_before)
{
    assert(element.IsValid());
    assert(kid.IsValid());

    Obj parent_dict = element.GetSDFObj();

    // Make sure the parent owns a kids array we can insert into.
    Obj kids = parent_dict.FindObj("K");
    const bool has_kid_array = kids && kids.IsArray();
    if (!has_kid_array) {
        kids = parent_dict.PutArray("K");
    }
    assert(kids.IsArray());

    // With no kids yet there is only one possible position.
    if (kids.Size() == 0) {
        insert_before = 0;
    }

    Obj child_dict = kid.GetSDFObj();
    kids.Insert(insert_before, child_dict);
    child_dict.Put("P", parent_dict);   // back-link to the parent
}
/**
 * Registers a new marked-content item as a kid of the given element and
 * returns the MCID allocated for it.
 *
 * Steps, in order:
 *  1. Sets the element's "Pg" entry to the page and makes sure the element
 *     has a "K" array.
 *  2. Determines the page's key into the structure tree's "ParentTree":
 *     the page's existing "StructParents" value if present, otherwise the
 *     tree's "ParentTreeNextKey" (created as 0 if missing or non-numeric),
 *     which is then written to the page and incremented on the tree.
 *  3. Looks up (or creates) the array stored in the parent tree under that
 *     key. The new MCID is the current size of that array; the element's
 *     dictionary is appended to it.
 *  4. Inserts the MCID as a number into the element's "K" array.
 *
 * The document must already have a valid structure tree; this is only
 * checked with assert, so the check is compiled out when NDEBUG is defined.
 *
 * @param doc            Document that owns the structure tree.
 * @param element        Structure element that will own the content item;
 *                       must be valid.
 * @param page           Page dictionary the marked content lives on.
 * @param insert_before  Index passed to Obj::InsertNumber on the "K" array.
 *                       A negative value (the default) becomes the array's
 *                       current size; the value is forced to 0 when the
 *                       array is empty.
 * @return The MCID assigned to the new content item.
 */
int ContentItem_Create(PDFDoc& doc, SElement& element, Obj page, int insert_before = -1)
{
    assert(element.IsValid());

    Obj owner = element.GetSDFObj();
    owner.Put("Pg", page);

    // Make sure the owning element has a kids array.
    Obj kids = owner.FindObj("K");
    if (!(kids && kids.IsArray())) {
        kids = owner.PutArray("K");
    }
    assert(kids.IsArray());

    // Resolve the insertion index: an empty array only has slot 0, and a
    // negative request becomes the current array size.
    if (kids.Size() == 0) {
        insert_before = 0;
    }
    else if (insert_before < 0) {
        insert_before = static_cast<int>(kids.Size());
    }

    STree tree = doc.GetStructTree();
    if (!tree.IsValid()) {
        assert(false);
    }
    Obj tree_dict = tree.GetSDFObj();

    // Find the page's key into the parent tree, allocating one if the page
    // does not have a "StructParents" entry yet.
    // NOTE(review): an existing "StructParents" entry is read with
    // GetNumber() without checking that it is a number.
    int parent_key = -1;
    Obj page_key = page.FindObj("StructParents");
    if (page_key) {
        parent_key = static_cast<int>(page_key.GetNumber());
    }
    else {
        Obj next_key = tree_dict.FindObj("ParentTreeNextKey");
        if (!next_key || !next_key.IsNumber()) {
            next_key = tree_dict.PutNumber("ParentTreeNextKey", 0);
        }
        parent_key = static_cast<int>(next_key.GetNumber());
        page.PutNumber("StructParents", parent_key);
        next_key.SetNumber(parent_key + 1);
    }

    // Fetch the parent tree, creating an empty one (with a "Nums" array) on
    // first use.
    Obj parent_tree = tree_dict.FindObj("ParentTree");
    if (!parent_tree || !parent_tree.IsDict()) {
        parent_tree = tree_dict.PutDict("ParentTree");
        parent_tree.PutArray("Nums");
    }
    NumberTree number_tree(parent_tree);
    assert(number_tree.IsValid());
    assert(parent_key >= 0);

    // The entry for this page is an array whose index is the MCID.
    Obj mc_owners = 0;
    DictIterator found = number_tree.GetIterator(parent_key);
    if (found.HasNext()) {
        mc_owners = found.Value();
    }
    if (!mc_owners || !mc_owners.IsArray()) {
        mc_owners = doc.CreateIndirectArray();
        number_tree.Put(parent_key, mc_owners);
    }

    // Next free slot in the page's array is the new MCID.
    const int mcid = static_cast<int>(mc_owners.Size());
    mc_owners.PushBack(owner);
    kids.InsertNumber(insert_before, mcid);
    return mcid;
}
/**
 * Attaches the kid element directly under the structure tree root.
 *
 * The kid's dictionary is inserted into the root's "K" array (created if
 * the root has no "K" entry yet), the kid's "P" entry is set to the root
 * dictionary, and the kid is then re-read from the tree with
 * STree::GetKid so the caller's handle refers to the element in its final
 * position.
 *
 * Limitation: when the root's "K" entry exists but is not an array (a
 * single kid stored directly), nothing is inserted and the function
 * returns with the kid unchanged.
 *
 * @param tree_root      Structure tree root; must be valid.
 * @param kid            Element to attach; must be valid. Updated on
 *                       success to the element returned by
 *                       tree_root.GetKid at the insertion index.
 * @param insert_before  Index in the "K" array at which the kid is
 *                       inserted. Negative values and values past the end
 *                       are clamped to the current number of kids, i.e. the
 *                       kid is appended.
 */
void STree_Insert(STree& tree_root, SElement& kid, int insert_before)
{
    assert(tree_root.IsValid() && kid.IsValid());

    Obj root_dict = tree_root.GetSDFObj();
    Obj kids = root_dict.FindObj("K");
    if (!kids)
    {
        kids = root_dict.PutArray("K");
    }
    else if (!kids.IsArray())
    {
        // TODO: a lone, non-array kid is still unsupported. Replacing it
        // with an array would drop the existing kid, so leave the tree
        // untouched instead.
        return;
    }
    assert(kids.IsArray());

    // Clamp the index so the GetKid call below always addresses the slot
    // the kid was actually inserted into.
    const int kid_count = static_cast<int>(kids.Size());
    if (insert_before < 0 || insert_before > kid_count)
    {
        insert_before = kid_count;
    }

    Obj kid_obj = kid.GetSDFObj();
    kids.Insert(insert_before, kid_obj);
    kid_obj.Put("P", root_dict);   // back-link to the root

    kid = tree_root.GetKid(insert_before);
}
/**
 * Returns the document's structure tree, creating an empty one if needed.
 *
 * When the document already has a valid structure tree it is returned
 * unchanged. Otherwise a new indirect dictionary with "Type" set to
 * "StructTreeRoot" and "ParentTreeNextKey" set to 0 is stored in the
 * document root under "StructTreeRoot" and returned wrapped in an STree.
 *
 * @param doc  Document to read or extend.
 * @return The existing or newly created structure tree.
 */
STree STree_Create(PDFDoc& doc)
{
    STree existing = doc.GetStructTree();
    if (!existing.IsValid()) {
        // No structure tree yet: build a minimal root and register it.
        Obj root_dict = doc.CreateIndirectDict();
        root_dict.PutName("Type", "StructTreeRoot");
        root_dict.PutNumber("ParentTreeNextKey", 0);
        doc.GetRoot().Put("StructTreeRoot", root_dict);
        return STree(root_dict);
    }
    return existing;
}
/**
 * Sample driver: tags one rectangular region on page 1 of
 * "../../TestFiles/my.pdf" as a "P" structure element and, if any content
 * was marked, writes the result to "../../TestFiles/tag_out.pdf".
 *
 * Any structure tree already in the document is removed first and a fresh
 * one is built.
 *
 * @return 0 on success, 1 if an exception was caught.
 */
int main(int argc, char *argv[])
{
    int ret = 0;
    PDFNet::Initialize();

    // Relative path to the folder containing test files.
    string input_path = "../../TestFiles/";

    try // Tag content on page 1 and rebuild the logical structure
    {
        //--preparation
        PDFDoc doc((input_path + "my.pdf").c_str());
        doc.InitSecurityHandler();

        // Remove the existing structure tree from the document catalog.
        doc.GetRoot().Erase("StructTreeRoot");

        Page pg = doc.GetPage(1);
        TRN_DispList dl = 0;
        REX(TRN_DispListCreate(pg.mp_page, &dl));

        // Everything that can throw while the display list is alive runs
        // inside this inner try so the list is released on the error path
        // too, while doc and pg are still in scope.
        try
        {
            // Kept from the original sample: erase a "StructTreeRoot" key
            // from the page dictionary.
            // NOTE(review): a stale "StructParents" entry on the page would
            // be reused by ContentItem_Create; consider whether that key
            // should be erased here as well.
            pg.GetSDFObj().Erase("StructTreeRoot");

            STree sroot = STree_Create(doc);

            bool marked = false;
            TRN_Bool result = 0;            // set by the tagging call: was any graphical element marked?
            TRN_Bool intersection_mode = 0; // tag using intersecting (0) or containing (1)
            vector<double> rects;
            bool reshuffle = true;

            //--mark content 1
            SElement p1 = SElement_Create(doc, "P");
            STree_Insert(sroot, p1, 0);
            int mcid1 = ContentItem_Create(doc, p1, pg.GetSDFObj());

            Obj prop1 = doc.CreateIndirectDict();
            prop1.PutNumber("MCID", mcid1);

            // One rectangle, stored as four consecutive doubles.
            rects.clear();
            rects.push_back(182); rects.push_back(453);
            rects.push_back(227); rects.push_back(464);
            REX(TRN_DispListTag(dl, &rects[0], rects.size()/4, "P", prop1.mp_obj, intersection_mode, reshuffle, &result));
            marked |= TBToB(result);

            //--output file
            if ( marked )
            {
                REX(TRN_DispListSave(dl, pg.mp_page));
                doc.GetRoot().PutDict("MarkInfo").PutBool("Marked", true);
                doc.Save("../../TestFiles/tag_out.pdf", pdftron::SDF::SDFDoc::e_linearized, 0);
            }
        }
        catch(...)
        {
            // Best-effort cleanup; the original error is what gets reported.
            (void)TRN_DispListDestroy(dl);
            throw;
        }

        REX(TRN_DispListDestroy(dl));
        cout << "\nDone." << endl;
    }
    catch(Common::Exception& e)
    {
        cout << e << endl;
        ret = 1;
    }
    catch(...)
    {
        cout << "Unknown Exception" << endl;
        ret = 1;
    }

    PDFNet::Terminate();
    return ret;
}