diff --git a/src/routes/_utils/runTesseract.js b/src/routes/_utils/runTesseract.js
index bb93cfe8..a4f48411 100644
--- a/src/routes/_utils/runTesseract.js
+++ b/src/routes/_utils/runTesseract.js
@@ -1,5 +1,7 @@
 import { importTesseractWorker } from '../_utils/asyncModules'

+const DESTROY_WORKER_DELAY = 300000 // 5 minutes
+
 // TODO: it's flaky to try to estimate tesseract's total progress this way
 const steps = [
   { status: 'loading tesseract core', proportion: 0.05 },
@@ -9,6 +11,48 @@ const steps = [
   { status: 'recognizing text', proportion: 0.6 }
 ]

+// lazily-created tesseract worker shared by all runTesseract() calls
+let worker
+// id of the pending destroyWorker() timer; falsy when no destroy is scheduled
+let destroyWorkerHandle
+
+// create the shared worker on first use
+async function initWorker () {
+  if (!worker) {
+    const createWorker = await importTesseractWorker()
+    // re-check after the await: an overlapping call may have created the worker
+    // in the meantime, and overwriting it would leak a worker that is never terminated
+    if (!worker) {
+      worker = createWorker()
+    }
+  }
+}
+
+// terminate the shared worker (if any); invoked by the timer set in scheduleDestroyWorker()
+function destroyWorker () {
+  console.log('destroying tesseract worker')
+  // the timer that invoked this function has already fired, so the stored handle is stale
+  destroyWorkerHandle = null
+  if (worker) {
+    worker.terminate()
+    worker = null
+  }
+}
+
+// destroy the worker after a delay to reduce memory usage
+function scheduleDestroyWorker () {
+  cancelDestroyWorker()
+  destroyWorkerHandle = setTimeout(destroyWorker, DESTROY_WORKER_DELAY)
+}
+
+// cancel a pending destroy, e.g. because the worker is about to be used again
+function cancelDestroyWorker () {
+  if (destroyWorkerHandle) {
+    clearTimeout(destroyWorkerHandle)
+    destroyWorkerHandle = null
+  }
+}
+
 function getTotalProgress (progressInfo) {
   const idx = steps.findIndex(({ status }) => progressInfo.status === status)
   let total = 0
@@ -19,9 +63,8 @@ function getTotalProgress (progressInfo) {
   return total
 }

-export async function runTesseract (url, onProgress) {
-  const worker = await importTesseractWorker()
-
+// start recognition of `url` on the shared worker and return the pending promise
+function recognize (url, onProgress) {
   // TODO: have to trick tesseract into not creating a blob URL because that would break our CSP
   // see https://github.com/naptha/tesseract.js/pull/322
   let promise
@@ -38,6 +81,18 @@ export async function runTesseract (url, onProgress) {
       onProgress(getTotalProgress(progressInfo))
     }
   })
-  const res = await promise
-  return res.text
+  return promise
+}
+
+// recognize the text in the image at `url`, reporting progress through onProgress;
+// the shared worker is destroyed DESTROY_WORKER_DELAY ms after a run finishes
+export async function runTesseract (url, onProgress) {
+  cancelDestroyWorker()
+  await initWorker()
+  try {
+    const { text } = await recognize(url, onProgress)
+    return text
+  } finally {
+    scheduleDestroyWorker()
+  }
 }
diff --git a/src/routes/_utils/tesseractWorker.js b/src/routes/_utils/tesseractWorker.js
index 543f0155..8bb34667 100644
--- a/src/routes/_utils/tesseractWorker.js
+++ b/src/routes/_utils/tesseractWorker.js
@@ -12,10 +12,11 @@ import { TesseractWorker } from 'tesseract.js'
 // which seems excessive. So we just live with the bug for now.
 // https://github.com/naptha/tesseract.js/issues/325
 const { origin } = location
-const tesseractWorker = new TesseractWorker({
+
+// export a factory rather than an instance so that a fresh worker can be
+// created after the previous one has been terminated
+export default () => new TesseractWorker({
   workerPath: `${origin}/${workerPath}`,
   langPath: `${origin}/`,
   corePath: `${origin}/${corePath}`
 })
-
-export default tesseractWorker