Re: FPDFImageObj_GetImageDataDecoded cannot correctly get the data of an image embedded in a Form.

28 views
Skip to first unread message

尼古拉特斯拉

unread,
Aug 5, 2025, 4:25:01 AMAug 5
to pdfium
So far I have found that the data returned is still a JFIF-format (JPEG-encoded) image, not the decoded pixel data I need.

On Tue, 5 Aug 2025 at 16:13, 尼古拉特斯拉 <20021...@gmail.com> wrote:
 

The FPDFImageObj_GetImageDataDecoded function fails to retrieve correct image data, so I can only use FPDFTextObj_GetRenderedBitmap as an alternative, but this function is relatively time-consuming.

I will provide a PDF file where FPDFImageObj_GetImageDataDecoded cannot correctly obtain pixel data for images embedded within Forms.   






#include <cstdint>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

#include "fpdf_edit.h"
#include "fpdf_save.h"
#include "fpdf_text.h"
#include "fpdfview.h"

// Compile-time switches for this sample. None of them is referenced by the
// code visible in this listing.
#define DEBUG_FOR_TEST 0
#define EXPORT_SMASK_RGB 0      // Export the PNG's SMask and RGB data
#define EXPORT_FILE_COMPRESS 0  // Export as a JPEG file for compression (smaller image)
#define REMOVE_OLD_OBJ 1        // Remove the original image object

namespace {
// Path of the PDF that main() loads with FPDF_LoadDocument.
const std::string input_pdf = "D:\\input.pdf";
// Path of the file that WriteBlock() opens to receive the saved copy.
const std::string out_pdf = "D:\\output.pdf";

// Random-access read callback over an in-memory buffer.
//
// |param| must point to a std::vector<unsigned char> holding the file bytes.
// Copies |size| bytes starting at offset |position| into |pBuf|.
// Returns 1 on success, or 0 (after logging to stderr) when the requested
// range does not lie entirely inside the buffer.
//
// NOTE(review): the signature looks like the one FPDF_FILEACCESS::m_GetBlock
// expects, but nothing in this listing references the function — confirm the
// intended use.
static int GetBlock(void* param,
                    unsigned long position,
                    unsigned char* pBuf,
                    unsigned long size) {
  const auto* buffer = static_cast<const std::vector<unsigned char>*>(param);
  const auto available = buffer->size();

  // Compare against the remaining length rather than computing
  // |position + size|: that sum can wrap around in unsigned arithmetic and
  // let an out-of-range request slip past the check.
  if (position > available || size > available - position) {
    std::cerr << "Read request exceeds buffer bounds." << std::endl;
    return 0;
  }

  // Nothing to copy for an empty request; this also avoids handing memcpy
  // pointers it has no reason to touch.
  if (size > 0) {
    std::memcpy(pBuf, buffer->data() + position, size);
  }
  return 1;
}

static int WriteBlock(FPDF_FILEWRITE* owner,
                      const void* buffer,
                      unsigned long size) {
  static std::ofstream outFile(out_pdf, std::ios::binary);
  if (!outFile) {
    std::cerr << "Error: Unable to open output file." << std::endl;
    return 0;
  }
  outFile.write(static_cast<const char*>(buffer), size);
  return outFile ? 1 : 0;
}

bool GetImageObjectDecodedData(FPDF_PAGEOBJECT image_object,
                               std::vector<uint8_t>& image_data,
                               std::vector<uint8_t>& smask_data) {
  image_data.clear();
  smask_data.clear();

  if (!image_object ||
      FPDFPageObj_GetType(image_object) != FPDF_PAGEOBJ_IMAGE) {
    return false;
  }

  // 获取主图像数据
  unsigned long data_len =
      FPDFImageObj_GetImageDataDecoded(image_object, nullptr, 0);
  if (data_len == 0) {
    return false;
  }

  image_data.resize(data_len);
  unsigned long copied_len = FPDFImageObj_GetImageDataDecoded(
      image_object, image_data.data(),
      static_cast<unsigned long>(image_data.size()));

  if (copied_len != data_len) {
    image_data.clear();
    return false;
  }
#if 0
  // 获取SMask数据
  CPDF_Image* cpdf_image =
      reinterpret_cast<CPDF_ImageObject*>(image_object)->m_pImage;
  if (!cpdf_image) {
    return true;
  }

  CPDF_Dictionary* image_dict = cpdf_image->GetDict();
  if (!image_dict || !image_dict->KeyExist("SMask")) {
    return true;
  }

  const CPDF_Stream* smask_stream = image_dict->GetStream("SMask");
  if (!smask_stream) {
    return true;
  }

  unsigned long smask_len = FPDFImageObj_GetImageDataDecoded(
      reinterpret_cast<FPDF_PAGEOBJECT>(const_cast<CPDF_Stream*>(smask_stream)),
      nullptr, 0);

  if (smask_len > 0) {
    smask_data.resize(smask_len);
    unsigned long smask_copied_len = FPDFImageObj_GetImageDataDecoded(
        reinterpret_cast<FPDF_PAGEOBJECT>(
            const_cast<CPDF_Stream*>(smask_stream)),
        smask_data.data(), static_cast<unsigned long>(smask_data.size()));

    if (smask_copied_len != smask_len) {
      smask_data.clear();
    }
  }

  return true;
#endif
}

// Callback type used to process page objects of the different kinds; it
// receives the object and its nesting level.
using PageObjHandler = std::function<void(FPDF_PAGEOBJECT, int level)>;

void TraversePageObjects(FPDF_PAGEOBJECT obj,
                         const PageObjHandler& handler,
                         int level = 0) {
  if (!obj) {
    return;
  }

  // 处理当前对象
  handler(obj, level);

  // 递归处理FORM对象
  if (FPDFPageObj_GetType(obj) == FPDF_PAGEOBJ_FORM) {
    int form_obj_count = FPDFFormObj_CountObjects(obj);
    for (int i = 0; i < form_obj_count; ++i) {
      FPDF_PAGEOBJECT inner_obj = FPDFFormObj_GetObject(obj, i);
      TraversePageObjects(inner_obj, handler, level + 1);
    }
  }
}

// 示例:处理每种类型对象的函数
void HandlePageObject(FPDF_PAGEOBJECT obj, int level) {
  const int type = FPDFPageObj_GetType(obj);
  const std::string indent(level * 2, ' ');  // 根据嵌套层级缩进

  switch (type) {
    case FPDF_PAGEOBJ_TEXT: {
      std::cout << indent << "[TEXT] at level " << level << std::endl;
      //// 提取文本内容示例:
      // unsigned long len = FPDFTextObj_GetText(obj, nullptr, 0);
      // if (len > 0) {
      //   std::vector<FPDF_WCHAR> buffer(len);
      //   FPDFTextObj_GetText(obj, buffer.data(), len);
      //   std::wcout << indent
      //              << "  Content: " << std::wstring(buffer.data(), len - 1)
      //              << std::endl;
      // }
      break;
    }
    case FPDF_PAGEOBJ_PATH: {
      std::cout << indent << "[PATH] at level " << level << std::endl;
      // 可添加路径分析代码
      break;
    }
    case FPDF_PAGEOBJ_IMAGE: {
      std::cout << indent << "[IMAGE] at level " << level << std::endl;
      // 图像处理代码(使用之前的GetImageObjectDecodedData)
      std::vector<uint8_t> image_data, smask_data;
      if (GetImageObjectDecodedData(obj, image_data, smask_data)) {
        std::cout << indent << "  Image size: " << image_data.size()
                  << " bytes, SMask: " << smask_data.size() << " bytes"
                  << std::endl;
      }
      break;
    }
    case FPDF_PAGEOBJ_SHADING: {
      std::cout << indent << "[SHADING] at level " << level << std::endl;
      // 可添加着色处理代码
      break;
    }
    case FPDF_PAGEOBJ_FORM: {
      std::cout << indent << "[FORM] at level " << level << std::endl;
      // FORM对象已在TraversePageObjects中递归处理
      break;
    }
    default:
      std::cerr << indent << "[UNKNOWN TYPE " << type << "]" << std::endl;
  }
}
}  // namespace

int main() {
  FPDF_InitLibrary();

  // 加载PDF文档
  FPDF_DOCUMENT document = FPDF_LoadDocument(input_pdf.c_str(), nullptr);
  if (!document) {
    std::cerr << "Failed to load PDF document." << std::endl;
    FPDF_DestroyLibrary();
    return -1;
  }

  int page_count = FPDF_GetPageCount(document);
  if (page_count <= 0) {
    std::cerr << "No pages in the document." << std::endl;
    FPDF_CloseDocument(document);
    FPDF_DestroyLibrary();
    return -1;
  }

  // 遍历每一页
  for (int i = 0; i < page_count; i++) {
    FPDF_PAGE page = FPDF_LoadPage(document, i);
    if (!page) {
      std::cerr << "Failed to load page " << i << std::endl;
      continue;
    }

    int obj_count = FPDFPage_CountObjects(page);

    for (int j = 0; j < obj_count; j++) {
      FPDF_PAGEOBJECT obj = FPDFPage_GetObject(page, j);
      if (!obj) {
        std::cerr << "  Failed to get object " << j << std::endl;
        continue;
      }

      // 使用递归遍历函数处理对象及其嵌套内容
      TraversePageObjects(obj, HandlePageObject);
    }

    FPDFPage_GenerateContent(page);
    FPDF_ClosePage(page);
  }

  // 保存PDF文档
  FPDF_FILEWRITE fileWrite;
  fileWrite.version = 1;
  fileWrite.WriteBlock = &WriteBlock;

  if (!FPDF_SaveAsCopy(document, &fileWrite, FPDF_NO_INCREMENTAL)) {
    std::cerr << "Failed to save PDF." << std::endl;
  } else {
    std::cout << "PDF saved successfully." << std::endl;
  }

  FPDF_CloseDocument(document);
  FPDF_DestroyLibrary();

  return 0;
}

#endif
Reply all
Reply to author
Forward
0 new messages