Hi:)
I'm building a Chrome extension that uses Tesseract.js for OCR, but I'm hitting two different issues depending on how I try to load the worker.
Here's the stripped-down relevant code:
// Start the Tesseract.js worker in two stages:
//   1. point Tesseract.js directly at the worker script packaged with the
//      extension (workerBlobURL: false asks it not to wrap that script in a Blob);
//   2. if that fails, fetch the same script ourselves and hand Tesseract.js a
//      Blob URL instead.
// `worker` is assigned, not declared, here: it belongs to the surrounding code.
try {
  worker = await Tesseract.createWorker('eng', 1, {
    corePath: chrome.runtime.getURL('tesseract/tesseract-core.wasm.js'),
    workerPath: chrome.runtime.getURL('tesseract/worker.min.js'),
    langPath: chrome.runtime.getURL('tesseract/lang/'),
    workerBlobURL: false,
    logger: (m) => console.log(m),
  });
} catch (regularWorkerError) {
  console.warn('Regular worker failed, trying Blob fallback:', regularWorkerError);
  // Declared outside the inner try so the finally clause can release it.
  let blobURL;
  try {
    blobURL = await createWorkerBlob();
    worker = await Tesseract.createWorker('eng', 1, {
      corePath: chrome.runtime.getURL('tesseract/tesseract-core.wasm.js'),
      workerPath: blobURL,
      langPath: chrome.runtime.getURL('tesseract/lang/'),
      logger: (m) => console.log(m),
    });
  } catch (blobWorkerError) {
    console.error('Both methods failed:', blobWorkerError);
    // Keep the underlying failure attached so callers can inspect why OCR is
    // unavailable instead of only seeing the generic message.
    throw new Error('OCR unavailable', { cause: blobWorkerError });
  } finally {
    // Once createWorker has settled the worker script has either been loaded
    // or has failed to load, so the Blob URL is no longer needed; revoking it
    // releases the in-memory copy of the script.
    if (blobURL !== undefined) {
      URL.revokeObjectURL(blobURL);
    }
  }
}
/**
 * Fetches the worker script packaged at `tesseract/worker.min.js` and exposes
 * its contents through a Blob URL.
 *
 * The caller owns the returned URL and should pass it to
 * `URL.revokeObjectURL` once the worker has been created.
 *
 * @returns {Promise<string>} A `blob:` URL backed by the worker script.
 * @throws {Error} If the script cannot be fetched successfully.
 */
async function createWorkerBlob() {
  const workerURL = chrome.runtime.getURL('tesseract/worker.min.js');
  const res = await fetch(workerURL);
  // fetch() only rejects on network-level failures; an error status would
  // otherwise be turned into a "worker script" containing the error body.
  if (!res.ok) {
    throw new Error(`Failed to fetch ${workerURL}: HTTP ${res.status}`);
  }
  const blob = await res.blob();
  return URL.createObjectURL(blob);
}
In manifest.json (MV3):
"web_accessible_resources": [
{
"resources": [
"tesseract/*",
"tesseract/lang/*",
"worker.min.js",
"tesseract-core.wasm.js",
"eng.traineddata"
],
"matches": ["<all_urls>"]
}
]
I get the following errors:
1. Regular worker (direct URL) fails with:
Failed to construct 'Worker':
Script at 'chrome-extension://.../tesseract/worker.min.js' cannot be accessed from origin ''
2. Blob fallback fails with:
Refused to create a worker from 'blob:https://us-east-1.console.aws.amazon.com/...'
because it violates the following Content Security Policy directive: "script-src ..."
Any practical examples or workarounds would be super helpful 🙏
Thanks!
My extension structure includes:
tesseract/worker.min.js
tesseract/tesseract-core.wasm.js
tesseract.min.js
tesseract/lang/eng.traineddata
offscreen.js:
// Handles RUN_OCR requests sent to the offscreen document: runs Tesseract.js on
// message.imageURL and answers through sendResponse with either
// { success: true, result: <TSV text> } or { success: false, error: <message> }.
//
// The listener itself is deliberately NOT async: an async function returns a
// Promise, so `return true` inside it would not be seen as the literal `true`
// that keeps the message channel open, and sendResponse would arrive too late.
// The asynchronous work therefore runs in an inner async function while the
// listener returns `true` synchronously.
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
  if (message.type !== 'RUN_OCR') {
    // Not ours: let other listeners handle it and do not hold the channel open.
    return false;
  }

  (async () => {
    try {
      const { imageURL } = message;
      const { createWorker } = Tesseract;
      const workerScriptURL = await createWorkerBlob();
      let tsv;
      try {
        const worker = await createWorker('eng', 1, {
          workerPath: workerScriptURL,
          langPath: chrome.runtime.getURL('tesseract/lang/'),
          corePath: chrome.runtime.getURL('tesseract/tesseract-core.wasm.js'),
          workerBlobURL: false,
        });
        try {
          await worker.setParameters({
            tessedit_pageseg_mode: 11,
            tessedit_ocr_engine_mode: 1,
            preserve_interword_spaces: '1',
          });
          const result = await worker.recognize(imageURL, {}, { tsv: true });
          tsv = result.data.tsv;
        } finally {
          // Always shut the worker down, even when recognition fails.
          await worker.terminate();
        }
      } finally {
        // The Blob URL is only needed while the worker is being started.
        URL.revokeObjectURL(workerScriptURL);
      }
      sendResponse({ success: true, result: tsv });
    } catch (err) {
      // Report the failure instead of leaving the sender waiting forever.
      sendResponse({ success: false, error: err?.message ?? String(err) });
    }
  })();

  // Keep the message channel open until sendResponse is called above.
  return true;
});
/**
 * Downloads the packaged `tesseract/worker.min.js` script and wraps it in a
 * Blob URL.
 *
 * The returned URL stays valid until it is passed to `URL.revokeObjectURL`,
 * which the caller should do once the worker no longer needs it.
 *
 * @returns {Promise<string>} A `blob:` URL whose content is the worker script.
 * @throws {Error} If the script cannot be fetched successfully.
 */
async function createWorkerBlob() {
  const scriptURL = chrome.runtime.getURL('tesseract/worker.min.js');
  const response = await fetch(scriptURL);
  // A non-2xx answer does not make fetch() reject, so check it explicitly
  // rather than building a worker from an error body.
  if (!response.ok) {
    throw new Error(`Failed to fetch ${scriptURL}: HTTP ${response.status}`);
  }
  const blob = await response.blob();
  return URL.createObjectURL(blob);
}
However, I get this error when the offscreen page tries to create the worker:
How can I correctly load the Tesseract WebAssembly core file (tesseract-core.wasm.js) in an offscreen document given that it's accessible via chrome.runtime.getURL(...) but blocked by the extension’s CSP when used in importScripts()?
Any workaround or best practice to get this running without using remote resources?
Thanks!
--
You received this message because you are subscribed to the Google Groups "Chromium Extensions" group.
To unsubscribe from this group and stop receiving emails from it, send an email to chromium-extens...@chromium.org.
To view this discussion visit https://groups.google.com/a/chromium.org/d/msgid/chromium-extensions/094ede13-b9ec-4db0-8ca2-a869f8c145d8n%40chromium.org.
I finally found a reliable solution by combining two approaches, and I'm sharing it in case anyone else needs it in the future:
- Using an offscreen document to safely execute OCR.
- Serving all Tesseract.js files locally from the extension.
1. Place the necessary Tesseract dist files into your extension folder. Example:
/scripts/
- tesseract.min.js
- tesser...@v5.0.4_dist_worker.min.js
- tesseract-core.wasm.js
- /languages/
- eng.traineddata
2. offscreen.html
<!DOCTYPE html>
<!-- Page for the extension's offscreen document. It has no visible content;
     it only loads the two scripts below. -->
<html>
<head>
<meta charset="UTF-8" />
</head>
<body>
<!-- Classic script, loaded from the extension's own scripts/ folder rather
     than from a remote host. Presumably this is what defines the global
     `Tesseract` object that offscreen.js reads; confirm against the bundle. -->
<script src="scripts/tesseract.min.js"></script>
<!-- Module script: module scripts are deferred, so this runs after the
     classic script above has executed. -->
<script type="module" src="offscreen.js"></script>
</body>
</html>
3. offscreen.js
// Handles RUN_OCR requests in the offscreen document: runs Tesseract.js on
// message.imageURL using only files shipped inside the extension.
//
// The outcome is reported the same way on success and on failure:
//   - as the direct reply to the sender (sendResponse), and
//   - as a broadcast { type: 'OCR_RESULT', data } message,
// where data is { success: true, result: <TSV text> } or
// { success: false, error: <message> }.
//
// The listener is deliberately NOT async: an async function returns a Promise,
// so `return true` inside it would not be seen as the literal `true` that
// keeps the message channel open for a later sendResponse call.
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
  if (message.type !== 'RUN_OCR') {
    // Not ours: do not hold the channel open for other message types.
    return false;
  }

  (async () => {
    let data;
    try {
      const { imageURL } = message;
      const { createWorker } = Tesseract;
      const worker = await createWorker('eng', 1, {
        // NOTE(review): this file name looks shortened by the mailing-list
        // archive; it must match the worker file actually placed in scripts/.
        workerPath: chrome.runtime.getURL("scripts/tesser...@v5.0.4_dist_worker.min.js"),
        // corePath is a directory here, not a single file.
        corePath: chrome.runtime.getURL("scripts/"),
        langPath: chrome.runtime.getURL("scripts/languages/"),
        workerBlobURL: false,
        logger: (m) => console.log(m),
      });
      try {
        await worker.setParameters({
          tessedit_pageseg_mode: 11,
          tessedit_ocr_engine_mode: 1,
          preserve_interword_spaces: '1',
        });
        const result = await worker.recognize(imageURL, {}, { tsv: true });
        data = { success: true, result: result.data.tsv };
      } finally {
        // Always shut the worker down, even when recognition fails.
        await worker.terminate();
      }
    } catch (err) {
      data = { success: false, error: err?.message ?? String(err) };
    }

    // Answer the original sender and release the message channel.
    sendResponse(data);

    try {
      await chrome.runtime.sendMessage({ type: 'OCR_RESULT', data });
    } catch (broadcastError) {
      // sendMessage rejects when nothing is listening; that must not surface
      // as an unhandled rejection in the offscreen document.
      console.warn('Could not deliver OCR_RESULT:', broadcastError);
    }
  })();

  // Keep the message channel open until sendResponse is called above.
  return true;
});