The
FPDFImageObj_GetImageDataDecoded
function fails to retrieve correct image data, so I can only use FPDFTextObj_GetRenderedBitmap
as an alternative, but this function is relatively time-consuming. I will provide a PDF file where
FPDFImageObj_GetImageDataDecoded
cannot correctly obtain pixel data for images embedded within Forms.
#include <cstdint>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

#include "fpdf_edit.h"
#include "fpdf_save.h"
#include "fpdf_text.h"
#include "fpdfview.h"
// Compile-time feature switches (0 = off, 1 = on).
// NOTE(review): none of these macros is referenced in the code visible here;
// confirm whether they are still needed.
#define DEBUG_FOR_TEST 0
#define EXPORT_SMASK_RGB 0 // Export the PNG's SMask and RGB data.
#define EXPORT_FILE_COMPRESS 0 // Export as a JPEG file for compression (smaller image).
#define REMOVE_OLD_OBJ 1 // Remove the original image object.
namespace {
// Path of the PDF that main() loads.
const std::string input_pdf{"D:\\input.pdf"};
// Path that WriteBlock() opens to receive the saved copy.
const std::string out_pdf{"D:\\output.pdf"};
/// Random-access read callback over an in-memory byte buffer.
///
/// NOTE(review): the signature looks like a PDFium custom file-access read
/// callback, but nothing visible in this file installs it — confirm the
/// intended use.
///
/// @param param    Points to the `std::vector<unsigned char>` holding the bytes.
/// @param position Offset of the first byte to read.
/// @param pBuf     Destination buffer; must have room for `size` bytes.
/// @param size     Number of bytes to copy.
/// @return 1 when the requested range was copied, 0 when the request was
///         rejected (null arguments or a range outside the buffer).
static int GetBlock(void* param,
                    unsigned long position,
                    unsigned char* pBuf,
                    unsigned long size) {
  const auto* buffer = static_cast<const std::vector<unsigned char>*>(param);
  if (buffer == nullptr || (pBuf == nullptr && size != 0)) {
    std::cerr << "Read request has a null buffer." << std::endl;
    return 0;
  }

  // The range check is written as two comparisons so that `position + size`
  // is never formed: that sum can wrap around (unsigned long is only 32 bits
  // on some platforms) and make an out-of-range request look valid.
  const std::size_t available = buffer->size();
  if (position > available || size > available - position) {
    std::cerr << "Read request exceeds buffer bounds." << std::endl;
    return 0;
  }

  if (size != 0) {
    std::memcpy(pBuf, buffer->data() + position, size);
  }
  return 1;
}
/// Write callback handed to FPDF_SaveAsCopy through FPDF_FILEWRITE.
///
/// The output stream is a function-local static: it is opened on `out_pdf`
/// the first time the callback runs and reused by every later call.
///
/// @param owner  The FPDF_FILEWRITE this callback belongs to (unused).
/// @param buffer Bytes to append to the output file.
/// @param size   Number of bytes in `buffer`.
/// @return 1 if the stream is still good after the write, otherwise 0.
static int WriteBlock(FPDF_FILEWRITE* owner,
                      const void* buffer,
                      unsigned long size) {
  static std::ofstream sink(out_pdf, std::ios::binary);

  if (sink) {
    const char* bytes = static_cast<const char*>(buffer);
    sink.write(bytes, size);
    return sink ? 1 : 0;
  }

  // Reached when the file could not be opened or an earlier write failed.
  std::cerr << "Error: Unable to open output file." << std::endl;
  return 0;
}
/// Copies the decoded data of an image page object into `image_data`.
///
/// Both output vectors are cleared on entry. `smask_data` is only filled by
/// the disabled (`#if 0`) section below, so with the code as built it is
/// always left empty.
///
/// @param image_object Page object to read; must be of type FPDF_PAGEOBJ_IMAGE.
/// @param image_data   Receives the bytes reported by
///                     FPDFImageObj_GetImageDataDecoded.
/// @param smask_data   Cleared; see above.
/// @return true when the image data was copied completely; false when the
///         object is null, is not an image, reports a zero length, or the
///         second call returns a different length than the first.
bool GetImageObjectDecodedData(FPDF_PAGEOBJECT image_object,
                               std::vector<uint8_t>& image_data,
                               std::vector<uint8_t>& smask_data) {
  image_data.clear();
  smask_data.clear();
  if (!image_object ||
      FPDFPageObj_GetType(image_object) != FPDF_PAGEOBJ_IMAGE) {
    return false;
  }

  // Fetch the main image data. The first call passes no buffer and is used
  // only to learn the required length.
  const unsigned long data_len =
      FPDFImageObj_GetImageDataDecoded(image_object, nullptr, 0);
  if (data_len == 0) {
    return false;
  }
  image_data.resize(data_len);
  const unsigned long copied_len = FPDFImageObj_GetImageDataDecoded(
      image_object, image_data.data(),
      static_cast<unsigned long>(image_data.size()));
  if (copied_len != data_len) {
    image_data.clear();
    return false;
  }

#if 0
  // Fetch the SMask data.
  // NOTE(review): disabled. This section uses CPDF_* types that none of the
  // headers included by this file declare, and it passes a stream pointer
  // cast to FPDF_PAGEOBJECT into an API that expects a page object — confirm
  // the intended approach before enabling it.
  CPDF_Image* cpdf_image =
      reinterpret_cast<CPDF_ImageObject*>(image_object)->m_pImage;
  if (!cpdf_image) {
    return true;
  }
  CPDF_Dictionary* image_dict = cpdf_image->GetDict();
  if (!image_dict || !image_dict->KeyExist("SMask")) {
    return true;
  }
  const CPDF_Stream* smask_stream = image_dict->GetStream("SMask");
  if (!smask_stream) {
    return true;
  }
  unsigned long smask_len = FPDFImageObj_GetImageDataDecoded(
      reinterpret_cast<FPDF_PAGEOBJECT>(const_cast<CPDF_Stream*>(smask_stream)),
      nullptr, 0);
  if (smask_len > 0) {
    smask_data.resize(smask_len);
    unsigned long smask_copied_len = FPDFImageObj_GetImageDataDecoded(
        reinterpret_cast<FPDF_PAGEOBJECT>(
            const_cast<CPDF_Stream*>(smask_stream)),
        smask_data.data(), static_cast<unsigned long>(smask_data.size()));
    if (smask_copied_len != smask_len) {
      smask_data.clear();
    }
  }
#endif

  // This return must stay outside the disabled section: without it the
  // success path falls off the end of a non-void function.
  return true;
}
// Callback type used to process page objects of any type. It receives the
// page object and the nesting level at which it was encountered.
using PageObjHandler = std::function<void(FPDF_PAGEOBJECT, int level)>;
// Visits `obj` and, when it is a FORM object, every object nested inside it,
// in pre-order. `handler` is invoked once per visited object with the depth
// at which it was found (`level` for `obj`, `level + 1` for its children,
// and so on). A null `obj` is ignored.
void TraversePageObjects(FPDF_PAGEOBJECT obj,
                         const PageObjHandler& handler,
                         int level = 0) {
  if (!obj) {
    return;
  }

  // The object itself is reported before any of its children.
  handler(obj, level);

  // Only FORM objects contain further objects to descend into.
  if (FPDFPageObj_GetType(obj) != FPDF_PAGEOBJ_FORM) {
    return;
  }

  const int child_count = FPDFFormObj_CountObjects(obj);
  int index = 0;
  while (index < child_count) {
    TraversePageObjects(FPDFFormObj_GetObject(obj, index), handler, level + 1);
    ++index;
  }
}
// Example handler: prints one line per page object describing its type,
// indented by two spaces per nesting level. For image objects it also
// reports the sizes returned by GetImageObjectDecodedData. Unknown types are
// reported on stderr.
void HandlePageObject(FPDF_PAGEOBJECT obj, int level) {
  const int type = FPDFPageObj_GetType(obj);
  const std::string indent(level * 2, ' ');  // Indent according to nesting level.

  // Writes "<indent><label><level>" to stdout.
  const auto announce = [&indent, level](const char* label) {
    std::cout << indent << label << level << std::endl;
  };

  if (type == FPDF_PAGEOBJ_TEXT) {
    announce("[TEXT] at level ");
    //// Example of extracting the text content:
    // unsigned long len = FPDFTextObj_GetText(obj, nullptr, 0);
    // if (len > 0) {
    //   std::vector<FPDF_WCHAR> buffer(len);
    //   FPDFTextObj_GetText(obj, buffer.data(), len);
    //   std::wcout << indent
    //              << " Content: " << std::wstring(buffer.data(), len - 1)
    //              << std::endl;
    // }
    return;
  }

  if (type == FPDF_PAGEOBJ_PATH) {
    announce("[PATH] at level ");
    // Path analysis code could be added here.
    return;
  }

  if (type == FPDF_PAGEOBJ_IMAGE) {
    announce("[IMAGE] at level ");
    // Image handling, using GetImageObjectDecodedData.
    std::vector<uint8_t> image_data;
    std::vector<uint8_t> smask_data;
    if (!GetImageObjectDecodedData(obj, image_data, smask_data)) {
      return;
    }
    std::cout << indent << " Image size: " << image_data.size()
              << " bytes, SMask: " << smask_data.size() << " bytes"
              << std::endl;
    return;
  }

  if (type == FPDF_PAGEOBJ_SHADING) {
    announce("[SHADING] at level ");
    // Shading handling code could be added here.
    return;
  }

  if (type == FPDF_PAGEOBJ_FORM) {
    announce("[FORM] at level ");
    // The contents of FORM objects are visited by TraversePageObjects.
    return;
  }

  std::cerr << indent << "[UNKNOWN TYPE " << type << "]" << std::endl;
}
} // namespace
/// Loads `input_pdf`, walks every page object (descending into FORM objects)
/// with HandlePageObject, regenerates each page's content and saves a copy
/// of the document through WriteBlock.
///
/// @return 0 on success; -1 when the document cannot be loaded, has no pages,
///         or cannot be saved.
int main() {
  FPDF_InitLibrary();

  // Load the PDF document.
  FPDF_DOCUMENT document = FPDF_LoadDocument(input_pdf.c_str(), nullptr);
  if (!document) {
    std::cerr << "Failed to load PDF document." << std::endl;
    FPDF_DestroyLibrary();
    return -1;
  }

  const int page_count = FPDF_GetPageCount(document);
  if (page_count <= 0) {
    std::cerr << "No pages in the document." << std::endl;
    FPDF_CloseDocument(document);
    FPDF_DestroyLibrary();
    return -1;
  }

  // Visit every page.
  for (int i = 0; i < page_count; i++) {
    FPDF_PAGE page = FPDF_LoadPage(document, i);
    if (!page) {
      std::cerr << "Failed to load page " << i << std::endl;
      continue;
    }

    const int obj_count = FPDFPage_CountObjects(page);
    for (int j = 0; j < obj_count; j++) {
      FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, j);
      if (!obj) {
        std::cerr << " Failed to get object " << j << std::endl;
        continue;
      }
      // Process the object and anything nested inside it recursively.
      TraversePageObjects(obj, HandlePageObject);
    }

    // Report a failed regeneration instead of silently dropping the result;
    // the page is still closed so that it is not leaked.
    if (!FPDFPage_GenerateContent(page)) {
      std::cerr << "Failed to generate content for page " << i << std::endl;
    }
    FPDF_ClosePage(page);
  }

  // Save the PDF document. The struct is value-initialized so that no member
  // is left indeterminate.
  FPDF_FILEWRITE fileWrite{};
  fileWrite.version = 1;
  fileWrite.WriteBlock = &WriteBlock;

  int exit_code = 0;
  if (!FPDF_SaveAsCopy(document, &fileWrite, FPDF_NO_INCREMENTAL)) {
    std::cerr << "Failed to save PDF." << std::endl;
    exit_code = -1;  // A failed save must not be reported as success.
  } else {
    std::cout << "PDF saved successfully." << std::endl;
  }

  FPDF_CloseDocument(document);
  FPDF_DestroyLibrary();
  return exit_code;
}
#endif