如何将 input 元素选中的 File 对象传递给 Web Worker,以便它可以用 JavaScript 读取文件的切片



我用下面的代码创建了一个 input 元素:
// Most recently accepted .txt File. Deliberately left as `var` so it remains a
// script-level global that other code in the page can read.
var s_curFile;

/**
 * "change" handler for the file picker: remembers the first selected file in
 * `s_curFile` when the picker's value ends with a ".txt" extension
 * (compared case-insensitively). Any other selection leaves `s_curFile` as is.
 *
 * @param {HTMLInputElement} input - the file input whose selection changed.
 */
function JSprocessFilePicker( input )
{
    const pickedPath = input.value;
    const extension = pickedPath.slice( pickedPath.lastIndexOf( '.' ) + 1 ).toLowerCase();
    const firstFile = input.files ? input.files[0] : undefined;

    // Nothing selected, or not a text file: keep the previous selection.
    if ( !firstFile || extension !== "txt" )
    {
        return;
    }

    s_curFile = firstFile;
    // TODO: send s_curFile to workers
}
// Build an <input type="file"> limited to .txt files, wire its change handler,
// and ask the browser to open the picker. The element is not appended to the
// document in this snippet.
var input = document.createElement( "input" );
const pickerAttributes = [
    [ "id", "file_picker" ],
    [ "type", "file" ],
    [ "accept", ".txt" ],
    // Inline handler attribute: runs on "change" with `this` bound to the element.
    [ "onchange", "JSprocessFilePicker(this)" ],
];
for ( const [ attrName, attrValue ] of pickerAttributes )
{
    input.setAttribute( attrName, attrValue );
}
input.click();

我想将s_curFile发送到Web Worker,以便我可以同时在主线程和worker上使用XMLHttpRequest读取切片,例如:

//on both worker and main thread
// Reads one slice of s_curFile synchronously through a temporary blob: URL.
const xhrReq = new XMLHttpRequest();
// x-user-defined stops the browser from decoding the bytes as UTF-8; this is
// commonly used so each response character carries one raw byte
// (read it with charCodeAt(i) & 0xFF).
xhrReq.overrideMimeType('text/plain; charset=x-user-defined');
//qwOffset and hSize are determined on the thread
const uri = URL.createObjectURL(s_curFile.slice(qwOffset, qwOffset + hSize));
try {
    xhrReq.open('GET', uri, false); //can i make it async on workers?
    xhrReq.send();
} finally {
    // Revoke even when open()/send() throws; otherwise the blob: URL (and the
    // reference it holds to the slice) stays registered.
    URL.revokeObjectURL(uri);
}
let Idx;
const sz = xhrReq.response.length;
for (Idx = 0; Idx < sz; ++Idx) {
    //do stuff with response
}

我只是读取文件。那么,我该如何将s_curFile发送给worker呢?我原以为必须借助SharedArrayBuffer,通过.postMessage(...)从主线程传给worker,但那样我该如何填充缓冲区?或者有没有别的办法?我相当确定XMLHttpRequest可以在worker中使用。(我需要这个功能,因为用户的本地文件可能有30 GB以上,受每个标签页的内存限制,我无法把它全部放进内存,所以希望用worker帮忙处理这些海量数据)

您可以简单地postMessage()您的File对象。底层数据不会被复制,只会复制包装器对象。

但是请注意,读取文件时不应该使用XMLHttpRequest。在旧的浏览器中,您可以使用FileReader(在Web Worker中甚至可以使用FileReaderSync)及其.readAsText()方法。在较新的浏览器中,您可以使用File.text()方法,它返回一个Promise,解析为按UTF-8解码的文件文本内容。

然而,要将文本文件作为块读取,您需要处理多字节字符。在中间分割这样的字符会破坏它:

(async () => {
const file = new File(["😱"], "file.txt");
const chunk1 = file.slice(0, file.size/2);
const chunk2 = file.slice(file.size/2);
const txt1 = await chunk1.text();
const txt2 = await chunk2.text();
const all  = await file.text();
console.log({txt1, txt2, all});
})();

为了规避这种情况,您需要使用TextDecoder:借助.decode()方法的stream选项,它能够把块末尾尚未构成完整字符的字节保留在内存中,并在解码下一个块时重建出正确的字符。

(async () => {
const file = new File(["😱"], "file.txt");
const decoder = new TextDecoder();
const chunk1 = file.slice(0, file.size/2);
const chunk2 = file.slice(file.size/2);
const txt1 = decoder.decode(await chunk1.arrayBuffer(), { stream: true});
const txt2 = decoder.decode(await chunk2.arrayBuffer(), { stream: true});
const all  = await file.text();
// now txt1 is empty and txt2 contains the whole glyph
console.log({txt1, txt2, all});
})();

但是TextDecoder不能在worker之间共享,所以当我们把文件拆分给不同的worker时,它并不能真正解决块边界处字符被截断的问题。不幸的是,我不知道这种情况下有什么简单的解决方案,所以是否值得为了提速而承担破坏少数字符的风险,需要由您自己决定。我知道在我所在的地区不能冒这个风险,因为大多数字符都是多字节字符,都会受到影响。

无论如何,这里有一个确实承担了这个风险的解决方案:它按worker的数量(CPU内核数减一)把文件拆分成若干块,每个worker以流的方式处理自己的那一块,并返回它找到的字母"a"(不区分大小写)的数量。

const inp = document.querySelector("input");
// limit our number of parallel Workers to the number of cores - 1 (for UI),
// but never below 1: hardwareConcurrency is 1 on single-core devices and can
// be unavailable, which would otherwise create zero Workers and silently
// report a count of 0.
const availableThreads = Math.max(1, (navigator.hardwareConcurrency || 2) - 1);
const workerUrl = buildWorkerURL();
const workers = Array.from({ length: availableThreads }, () => new Worker(workerUrl));

/**
 * Sends one chunk to one Worker and resolves with the count it posts back.
 *
 * A dedicated MessageChannel is used per request so that several requests to
 * the same Worker can be in flight without their answers getting mixed up.
 *
 * @param {Worker} worker - the Worker that will process the chunk.
 * @param {Blob} chunk - the slice of the file to scan.
 * @returns {Promise<number>} the count posted by the Worker; rejects with the
 *   posted value when it is not a number, or with the ErrorEvent when the
 *   Worker raises an error.
 */
function countInChunk(worker, chunk) {
  return new Promise((resolve, reject) => {
    const { port1, port2 } = new MessageChannel();
    const onError = (evt) => finish(reject, evt);
    // Single exit point: drop the error listener and close the port so that
    // neither outlives the request.
    const finish = (settle, payload) => {
      worker.removeEventListener("error", onError);
      port2.close();
      settle(payload);
    };
    worker.addEventListener("error", onError);
    port2.onmessage = ({ data }) => {
      if (typeof data !== "number" || Number.isNaN(data)) {
        // You could handle progress events here if you wish
        finish(reject, data);
        return;
      }
      finish(resolve, data);
    };
    worker.postMessage(chunk, [port1]);
  });
}

inp.addEventListener("change", async () => {
  const file = inp.files[0];
  if (!file) {
    // The selection can be empty, e.g. when the user cancels the dialog.
    return;
  }
  if (!file.name.toLowerCase().endsWith(".txt")) {
    console.log("not a .txt file");
    return;
  }
  const chunkSize = Math.ceil(file.size / workers.length);
  try {
    const counts = await Promise.all(workers.map((worker, i) => {
      // we send only a chunk for convenience
      // the actual data never moves anyway
      const chunk = file.slice(chunkSize * i, chunkSize * (i + 1));
      return countInChunk(worker, chunk);
    }));
    // each worker sent its own count, we have to do the sum here
    const numberOfAs = counts.reduce((a, b) => a + b, 0);
    console.log(`The file ${file.name} contains ${numberOfAs} "A"s`);
  } catch (err) {
    // Without this the rejection would go unhandled inside the async listener.
    console.error(`Could not process ${file.name}`, err);
  }
});

/**
 * Turns the inline `<script type=worker>` element into a blob: URL that can
 * be handed to the Worker constructor.
 *
 * @returns {string} an object URL pointing at a JavaScript Blob holding the
 *   element's text content.
 */
function buildWorkerURL() {
  const { textContent: workerSource } = document.querySelector("script[type=worker]");
  const workerBlob = new Blob([workerSource], { type: "text/javascript" });
  return URL.createObjectURL(workerBlob);
}
<input type=file>
<!-- our worker script -->
<script type=worker>
// Worker entry point.
// Receives a Blob (one chunk of the file) as `data` and a MessagePort in
// `ports[0]`. Streams the chunk, counts every "a"/"A" in it, and posts the
// count on the port. If anything fails, a descriptive string is posted
// instead, so the requester is never left waiting for an answer.
onmessage = async ({ data, ports }) => {
  const [replyPort] = ports;
  try {
    const reader = data.stream().getReader();
    const decoder = new TextDecoder();
    let found = 0;
    for (;;) {
      const { done, value } = await reader.read();
      if (done) {
        break;
      }
      // 'value' is an Uint8Array
      // we decode it as UTF-8 text, with the 'stream' option so a character
      // split between two reads is reassembled by the decoder
      const text = decoder.decode(value, { stream: true });
      // do some processing over the chunk of text
      // be careful to NOT leak the data here
      found += (text.match(/[aA]/g) ?? []).length;
    }
    // use the sent MessagePort to be able to "promisify"
    // the whole process
    replyPort.postMessage(found);
  } catch (err) {
    // The prefix guarantees the payload cannot be mistaken for a count.
    const reason = err instanceof Error ? err.message : String(err);
    replyPort.postMessage(`Worker failed: ${reason}`);
  }
};
</script>

最新更新