Drive.Files.copy with OCR fails with "Invalid Argument" inside a loop, but works in isolation.

54 views
Skip to first unread message

Namit Gupta

unread,
Jul 17, 2025, 3:47:04 AMJul 17
to Google Apps Script Community

Hello,

I have a Google Apps Script that is designed to perform OCR on a PDF from Gmail and save the data to a Google Sheet.

The script consistently fails on the Drive.Files.copy() or Drive.Files.create() command with the error: Exception: Invalid argument.

The strange part is that when I run the exact same API call in a separate, isolated test function on a known file, it works perfectly. The failure only happens when the call is made inside the main function's loops.

We have already tried:

  • Rewriting forEach loops as standard for loops.

  • Forcing re-authorization of all scopes, including the drive scope.

  • Ensuring the Advanced Drive API service is enabled.

  • Verifying all parameters passed to the API call.

This seems like a platform bug or a strange execution context issue. Can anyone see what might be causing the API call to fail only when run inside the full script?

Here is the complete code, which includes the main failing function (processInvoices) and the successful diagnostic function (runFinalOcrTest).


// --- SCRIPT CONFIGURATION ---
// Using a placeholder email for privacy
var GMAIL_QUERY = 'from:ven...@example.com subject:"Invoice" has:attachment is:unread';
var FOLDER_NAME = 'Gmail Invoices';

/**
 * Main function that fails.
 */
function processInvoices() {
  var folders = DriveApp.getFoldersByName(FOLDER_NAME);
  var folder = folders.hasNext() ? folders.next() : DriveApp.createFolder(FOLDER_NAME);

  var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
 
  if (sheet.getRange("A1").getValue() === "") {
    var headers = [
      "Invoice Number", "Invoice Date", "Grand Total", "Buyer Name", "Buyer GSTIN",
      "Contract No", "Delivery Order No", "Truck No", "HSN No", "Material",
      "Quantity (MTS)", "Rate", "Gross Amount", "IGST Amount", "Cess Amount",
      "TCS Amount", "Amount in Words", "IRN No", "File Name"
    ];
    sheet.getRange(1, 1, 1, headers.length).setValues([headers]).setFontWeight("bold");
  }
 
  var threads = GmailApp.search(GMAIL_QUERY);
 
  for (var i = 0; i < threads.length; i++) {
    var messages = threads[i].getMessages();
    for (var j = 0; j < messages.length; j++) {
      var message = messages[j];
      var attachments = message.getAttachments();
      for (var k = 0; k < attachments.length; k++) {
        var attachment = attachments[k];
       
        if (attachment.getContentType() === 'application/pdf') {
          var fileName = attachment.getName();
          var pdfText;

          try {
            var originalPdfFile = folder.createFile(attachment);
            var ocrOptions = { ocr: true, ocrLanguage: 'en' };
            var copyResource = { name: 'temp_ocr_doc_for_' + fileName };
           
            var ocrFile = Drive.Files.copy(copyResource, originalPdfFile.getId(), ocrOptions);
           
            var doc = DocumentApp.openById(ocrFile.id);
            pdfText = doc.getBody().getText();
           
            Drive.Files.remove(ocrFile.id);
            Logger.log('Successfully performed OCR on: ' + fileName);

          } catch (e) {
            Logger.log('Error during OCR processing for ' + fileName + ': ' + e.toString());
            continue;
          }
         
          var details = extractAllDetailsFromPDF(pdfText);
         
          if (details.invoiceNumber) {
            sheet.appendRow([
              details.invoiceNumber, details.invoiceDate, details.grandTotal, details.buyerName, details.buyerGstin,
              details.contractNo, details.deliveryOrderNo, details.truckNo, details.hsnNo, details.material,
              details.quantity, details.rate, details.grossAmount, details.igst, details.cess,
              details.tcs, details.amountInWords, details.irnNo, fileName
            ]);
            Logger.log('Successfully extracted details from ' + fileName);
          } else {
            Logger.log('Could not extract invoice number from ' + fileName + ' after OCR. Skipping.');
          }
         
          Utilities.sleep(1000);
        }
      }
      message.markRead();
    }
  }
}

/**
 * Diagnostic function that SUCCEEDS.
 */
function runFinalOcrTest() {
  var fileId = 'PASTE_A_VALID_FILE_ID_HERE'; // Hardcoded ID of a saved PDF
  var fileName = 'sample_invoice.pdf';
 
  Logger.log('Starting final diagnostic test on a known file ID: ' + fileId);
 
  try {
    var ocrOptions = { ocr: true, ocrLanguage: 'en' };
    var copyResource = { name: 'final_diagnostic_test_' + fileName };
   
    var ocrFile = Drive.Files.copy(copyResource, fileId, ocrOptions);
   
    Logger.log('SUCCESS: The API call worked.');
    Drive.Files.remove(ocrFile.id);

  } catch (e) {
    Logger.log('ERROR: The API call failed. Error: ' + e);
  }
}

/**
 * The data extraction function (regex patterns).
 */
function extractAllDetailsFromPDF(text) {
  var details = {
    invoiceNumber: null, invoiceDate: null, grandTotal: null, buyerName: null, buyerGstin: null,
    contractNo: null, deliveryOrderNo: null, truckNo: null, hsnNo: null, material: null,
    quantity: null, rate: null, grossAmount: null, igst: null, cess: null,
    tcs: null, amountInWords: null, irnNo: null
  };

  var match;

  match = text.match(/Invoice No\.\s?\/\s?Date[^\d]*(\d{12})/i);
  if (match) details.invoiceNumber = match[1];

  match = text.match(/Invoice No\.\s?\/\s?Date[^\d]*\d{12}\/(\d{2}\.\d{2}\.\d{4})/i);
  if (match) details.invoiceDate = match[1];
 
  match = text.match(/INR\s+([\d,]+\.\d{2})/i);
  if (match) details.grandTotal = parseFloat(match[1].replace(/,/g, ''));

  match = text.match(/Buyer\s*\n(.*?)\n/i);
  if (match) details.buyerName = match[1].trim();

  match = text.match(/Buyer\s*(?:.|\n)*?GSTIN:(\S+)/i);
  if (match) details.buyerGstin = match[1];

  match = text.match(/Contract No\.\s?\/\s?Date.*:\s*(\d+)/i);
  if (match) details.contractNo = match[1];

  match = text.match(/Delivery Order No\.\s*:\s*(\d+)/i);
  if (match) details.deliveryOrderNo = match[1];

  match = text.match(/Truck No\/?RR No\s*:\s*(\S+)/i);
  if (match) details.truckNo = match[1];

  match = text.match(/HSN No\s*:\s*(\d+)/i);
  if (match) details.hsnNo = match[1];

  match = text.match(/Material Code Description\s*\n(.*?)\n/i);
  if (match) details.material = match[1].trim();
 
  match = text.match(/(\d+\.\d+)\s+MTS\s+[\d,]+\.\d{2}/i);
  if (match) details.quantity = parseFloat(match[1]);

  match = text.match(/\d+\.\d+\s+MTS\s+([\d,]+\.\d{2})/i);
  if (match) details.rate = parseFloat(match[1].replace(/,/g, ''));
 
  match = text.match(/Net Price\s+([\d,]+\.\d{2})/i);
  if (match) details.grossAmount = parseFloat(match[1].replace(/,/g, ''));

  match = text.match(/Integrated GST 5%\s+([\d,]+\.\d{2})/i);
  if (match) details.igst = parseFloat(match[1].replace(/,/g, ''));

  match = text.match(/GST Comp Cess\s+([\d,]+\.\d{2})/i);
  if (match) details.cess = parseFloat(match[1].replace(/,/g, ''));

  match = text.match(/TCS 1%\s+([\d,]+\.\d{2})/i);
  if (match) details.tcs = parseFloat(match[1].replace(/,/g, ''));
 
  match = text.match(/(INR .*? Paisa Only)/i);
  if (match) details.amountInWords = match[1];

  match = text.match(/IRN NO\.:(\S+)/i);
  if (match) details.irnNo = match[1];

  return details;
}

SMAARTE Group

unread,
Jul 17, 2025, 10:03:57 PMJul 17
to google-apps-sc...@googlegroups.com
Namit,

I am not a coder, but I did spend several hours working with Gemini Pro 2.5 to implement an apps script to do something similar to what you're looking for and perhaps that will be helpful for you.  There are two other GS files associated with this project, but I believe what I've shared below will be the most helpful.

appsscript.json
{
  "timeZone": "America/Los_Angeles",
  "exceptionLogging": "STACKDRIVER",
  "runtimeVersion": "V8",
  "oauthScopes": [
  ],
  "webapp": {
    "executeAs": "USER_DEPLOYING",
    "access": "ANYONE_ANONYMOUS"
  },
  "dependencies": {
    "enabledAdvancedServices": [
      {
        "userSymbol": "Drive",
        "version": "v3",
        "serviceId": "drive"
      },
      {
        "userSymbol": "Docs",
        "serviceId": "docs",
        "version": "v1"
      }
    ]
  }
}


InvoiceTracking.gs
// InvoiceTracking.gs ==============================================
// SCRIPT CONSTANTS - CONFIGURE YOUR SETTINGS HERE
// =================================================================

const TARGET_EMAIL_QUERY = "to:xxxxx label:unread";
const CONFIRMATION_EMAIL_SENDER = "xxxxx";
const ORIGINALS_FOLDER_ID = "xxxxx";
const OCR_FOLDER_ID = "xxxxx";
const SHEET_NAME = "xxxxx";
const VENDORS_SHEET_NAME = "VENDORS";

// =================================================================
// HELPER FUNCTION to read from the VENDORS sheet
// =================================================================

/**
 * Reads the vendor and category data from the "VENDORS" sheet.
 * @returns {Array<{vendor: string, category: string}>} An array of vendor/category objects.
 */
function getVendorData() {
  try {
    const spreadsheet = SpreadsheetApp.getActiveSpreadsheet();
    const vendorSheet = spreadsheet.getSheetByName(VENDORS_SHEET_NAME);
    if (!vendorSheet) {
      console.error(`Vendor sheet named "${VENDORS_SHEET_NAME}" not found.`);
      return [];
    }
    // Assumes categories are in column A and vendors are in column B, starting from row 2.
    const data = vendorSheet.getRange("A2:B" + vendorSheet.getLastRow()).getValues();
    const vendorMap = data
      .filter(row => row[0] && row[1]) // Ensure neither cell is blank
      .map(row => ({ category: row[0], vendor: row[1] }));

    // Sort by vendor name length (descending) to match longest names first (e.g., "GS Cleaning Services" before "GS Cleaning").
    vendorMap.sort((a, b) => b.vendor.length - a.vendor.length);
   
    console.log(`Successfully loaded ${vendorMap.length} vendor/category pairs.`);
    return vendorMap;
  } catch (e) {
    console.error(`Error reading from vendor sheet: ${e.toString()}`);
    return [];
  }
}


// =================================================================
// MAIN FUNCTION
// =================================================================

/**
 * This is the main function that will be triggered to process invoices.
 */
function processInvoices() {
  const vendorData = getVendorData(); // Load vendor data once.
  if (vendorData.length === 0) {
    console.error("Halting execution because no vendor data could be loaded. Please check the 'VENDORS' sheet.");
    return;
  }

  const threads = GmailApp.search(TARGET_EMAIL_QUERY);
  const spreadsheet = SpreadsheetApp.getActiveSpreadsheet();
  const sheet = spreadsheet.getSheetByName(SHEET_NAME);

  if (!sheet) {
    console.error(`Sheet named "${SHEET_NAME}" not found. Please check the SHEET_NAME constant.`);
    return;
  }

  const originalsFolder = DriveApp.getFolderById(ORIGINALS_FOLDER_ID);
  const ocrFolder = DriveApp.getFolderById(OCR_FOLDER_ID);

  for (const thread of threads) {
    const messages = thread.getMessages();
    for (const message of messages) {
      const fromHeader = message.getFrom();
      const emailMatch = fromHeader.match(/<(.+)>/);
      const senderEmail = emailMatch ? emailMatch[1] : fromHeader;
      const receivedDate = message.getDate();
     
      const attachments = message.getAttachments();

      if (attachments.length > 0) {
        console.log(`Processing email from "${senderEmail}" with subject: "${message.getSubject()}"`);

        const attachmentInfo = processAttachments(attachments, originalsFolder, ocrFolder);
        if (attachmentInfo.length > 0) {
          const allText = attachmentInfo.map(info => info.ocrText).join('\n\n---\n\n');
          const invoiceData = extractInvoiceData(allText, vendorData); // Pass vendor data to parser
          const attachmentNames = attachmentInfo.map(info => info.fileName);

          updateSpreadsheet(
            sheet,
            invoiceData,
            receivedDate,
            senderEmail,
            attachmentInfo.map(info => info.originalFile),
            attachmentNames
          );

          shareFilesSilently(senderEmail, spreadsheet, attachmentInfo.map(info => info.originalFile));
         
          if (SEND_CONFIRMATION_EMAIL) {
            sendConfirmationEmail(senderEmail, spreadsheet.getUrl(), attachmentNames);
          } else {
            console.log("Confirmation email sending is disabled by the SEND_CONFIRMATION_EMAIL constant.");
          }
        }
      }
    }
    labelAndArchiveThread(thread);
    thread.markRead();
  }
  console.log("Invoice processing complete.");
}

/**
 * Saves attachments, creates OCR'd versions, and extracts text from paragraphs AND tables.
 */
function processAttachments(attachments, originalsFolder, ocrFolder) {
  const attachmentInfo = [];
  for (const attachment of attachments) {
    try {
      if (!attachment.getContentType().startsWith('application/pdf') && !attachment.getContentType().startsWith('image/')) {
        console.log(`Skipping attachment "${attachment.getName()}" as it is not a PDF or image.`);
        continue;
      }
      const originalFile = originalsFolder.createFile(attachment);
      const fileName = originalFile.getName();
      console.log(`Saved original file: ${fileName}`);

      const ocrFileResource = { name: `${fileName} (OCR Text)`, parents: [OCR_FOLDER_ID], mimeType: 'application/vnd.google-apps.document' };
      const ocrFile = Drive.Files.create(ocrFileResource, attachment.copyBlob(), { ocr: true, ocrLanguage: 'en' });
      console.log(`Created OCR file with ID: ${ocrFile.id}`);
      Utilities.sleep(3000);

      const doc = Docs.Documents.get(ocrFile.id);
      let ocrText = '';
      doc.body.content.forEach(element => {
        if (element.paragraph) {
          element.paragraph.elements.forEach(textRun => {
            if(textRun.textRun) {
              ocrText += textRun.textRun.content;
            }
          });
        } else if (element.table && element.table.tableRows) {
          element.table.tableRows.forEach(row => {
            if (row.tableCells) {
              row.tableCells.forEach(cell => {
                if (cell.content) {
                  cell.content.forEach(cellContent => {
                    if (cellContent.paragraph) {
                      cellContent.paragraph.elements.forEach(textRun => {
                         if(textRun.textRun) {
                            ocrText += textRun.textRun.content + '\t';
                         }
                      });
                    }
                  });
                }
              });
            }
            ocrText += '\n';
          });
        }
      });
     
      attachmentInfo.push({ originalFile, ocrText, fileName });
    } catch (e) {
      console.error(`Failed to process attachment ${attachment.getName()}. Error: ${e.toString()}`);
    }
  }
  return attachmentInfo;
}


/**
 * Helper function to find the invoice amount from text using multiple patterns in a specific order.
 */
function findInvoiceAmount(text) {
  console.log("--- Starting Amount Extraction ---");
  console.log("Full text for amount extraction:\n" + text);

  let match = text.match(/Grand Total[\s\S]*?\$?\s*([\d,]+\.\d{2})/i);
  if (match && match[1]) {
    console.log(`SUCCESS: Found amount with 'Grand Total' pattern. Match: ${match[0]}`);
    return parseFloat(match[1].replace(/[$,]/g, ''));
  }

  match = text.match(/(?:Amount due:|TOTAL AMOUNT NOW DUE)\s*\$?([\d,]+\.\d{2})/i);
  if (match && match[1]) {
    console.log(`SUCCESS: Found amount with 'Amount due:' pattern. Match: ${match[0]}`);
    return parseFloat(match[1].replace(/[$,"]/g, ''));
  }

  match = text.match(/\bTotal\b\s*\$?([\d,]+\.\d{2})/i);
  if (match && match[1]) {
    console.log(`SUCCESS: Found amount with 'Total' pattern. Match: ${match[0]}`);
    return parseFloat(match[1].replace(/[$,"]/g, ''));
  }
 
  const allAmounts = text.match(/\$?[\d,]+\.\d{2}/g);
  if (allAmounts) {
    const numericAmounts = allAmounts
      .map(v => parseFloat(v.replace(/[$,"]/g, '')))
      .filter(v => !isNaN(v));
    if (numericAmounts.length > 0) {
      const maxAmount = Math.max(...numericAmounts);
      console.log(`SUCCESS: Fallback selected largest amount: ${maxAmount}`);
      return maxAmount;
    }
  }

  console.log("FAILURE: Could not find any invoice amount in the text.");
  return null;
}

/**
 * Converts a string to proper case (e.g., "HELLO WORLD" -> "Hello World").
 */
function toProperCase(str) {
  if (!str) return str;
  return str.replace(/\w\S*/g, txt => txt.charAt(0).toUpperCase() + txt.substr(1).toLowerCase());
}

/**
 * Attempts to extract structured data from the OCR'd text.
 * @param {string} text The OCR'd text to parse.
 * @param {Array<{vendor: string, category: string}>} vendorData The map of vendors and categories.
 */
function extractInvoiceData(text, vendorData) {
  console.log("--- Starting Data Extraction ---");
  console.log("Full text for data extraction:\n" + text);

  let invoiceDate = null;
  let dueDate = null;
  let invoiceNumber = null;

  // 1. Look for specific keywords for dates and invoice number
  let match = text.match(/(?:Invoice date:|ISSUED:)\s*(\w+\s\d{1,2},\s\d{4}|\d{1,2}\/\d{1,2}\/\d{2,4})/i);
  if (match && match[1]) {
    invoiceDate = new Date(match[1]);
  }

  match = text.match(/Due date:?\s*(\w+\s\d{1,2},\s\d{4}|\d{1,2}\/\d{1,2}\/\d{2,4})/i);
  if (match && match[1]) {
    dueDate = new Date(match[1]);
  }
 
  match = text.match(/(?:Invoice No:|Invoice No\.|INVOICE #|Invoice number:|Invoice#:)\s*([A-Z0-9-.\s]+)/i);
  if (match && match[1]) {
    invoiceNumber = match[1].trim().split('\n')[0];
    // MODIFICATION: Truncate invoice number to 12 characters.
    if (invoiceNumber) {
      invoiceNumber = invoiceNumber.substring(0, 12);
    }
  }

  // 2. If invoice date is still not found, do a generic search.
  if (!invoiceDate) {
    const genericDateMatch = text.match(/(?<!Due\sdate:?\s*)(\b\w+\s\d{1,2},\s\d{4}\b|\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/i);
    if (genericDateMatch && genericDateMatch[1]) {
       invoiceDate = new Date(genericDateMatch[1]);
    }
  }
 
  const amount = findInvoiceAmount(text);
 
  // MODIFICATION: Use the dynamic vendorData list for matching.
  let vendor = "UNKNOWN";
  let category = "UNCATEGORIZED";

  for (const item of vendorData) {
    // Escape special characters in vendor name for regex
    const escapedVendor = item.vendor.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const vendorRegex = new RegExp(`\\b${escapedVendor}\\b`, "i");
    if (text.match(vendorRegex)) {
        vendor = item.vendor;
        category = item.category;
        console.log(`SUCCESS: Matched vendor "${vendor}" with category "${category}".`);
        break; // Stop after the first (and longest) match is found.
    }
  }

  if (category !== "UNCATEGORIZED") {
    category = toProperCase(category);
  }
 
  console.log(`Extracted Data: Invoice Date=${invoiceDate}, Due Date=${dueDate}, Inv#=${invoiceNumber}, Amount=${amount}, Vendor=${vendor}, Category=${category}`);
  return { invoiceDate, dueDate, invoiceNumber, amount, category, vendor };
}


Regards,


--
You received this message because you are subscribed to the Google Groups "Google Apps Script Community" group.
To unsubscribe from this group and stop receiving emails from it, send an email to google-apps-script-c...@googlegroups.com.
To view this discussion visit https://groups.google.com/d/msgid/google-apps-script-community/201d53fb-fc13-4062-bf67-2d965dd2292dn%40googlegroups.com.
Reply all
Reply to author
Forward
0 new messages