在 Node.js(JavaScript)中,如何用流(stream)向只接受文件的外部程序输入数据并读取其输出

问题

我需要上传数百个PDF文档,将它们转换为HTML,然后将HTML存储在MongoDB中。我目前的做法是先把传入的PDF文档保存到文件系统,再在文件系统中将其转换为HTML。有没有办法使用流(stream)来避免所有的文件I/O?

目前的做法(可以工作,但速度慢)

我在用:

  1. 使用 Busboy 读取上传的PDF文件,并将其保存到文件系统。
  2. 我在node.js中用“exec”创建了一个子进程,它调用“'pdftohtml -c -s -noframes -nodrm ' + inputFileNamePDF + ' ' + outputFileNameHTML”。HTML输出文件被保存到文件系统。
  3. 然后我遍历所有的HTML文件,创建一个针对MongoDB的批量(Bulk)upsert。

理想情况下,我希望把上传的PDF文件以流的方式直接传给“inputFileNamePDF”,然后把转换后的“outputFileNameHTML”以流的方式传给批量upsert。

代码如下:

var path = require("path"), Busboy = require('busboy') http = require('http'), util = require('util'), fs = require('fs-extra'), pdftohtml = require('pdftohtmljs'), exec =require('child_process').exec, pdf_extract = require('pdf-extract'), exports.postUpload = function (req, res) { // parse a file upload var fileName = ""; var uploadDir = '/tmp/' + res.locals.user._doc.email.replace(/[@\.]/g,"_"); var infiles = 0, outfiles = 0, done = false, busboy = new Busboy({ headers: req.headers }); console.log('Start parsing form ...'); busboy.on('file', function (fieldname, file, filename) { ++infiles; console.log("file event #" + infiles); onFile(fieldname, file, filename, function () { ++outfiles; console.log("file #" + infiles + " written."); if (done) console.log(outfiles + '/' + infiles + ' parts written to disk'); if (done && infiles === outfiles) { // ACTUAL EXIT CONDITION console.log('All parts written to disk'); res.writeHead(200, { 'Connection': 'close' }); res.end("That's all folks!"); convertToHTMLTxt(); } }); }); busboy.on('finish', function () { console.log('Done parsing form!'); done = true; }); req.pipe(busboy); function onFile(fieldname, file, filename, next) { // or save at some other location var fileName = ""; fileName = filename.replace( /[^a-z0-9_\-]/gi,"_"); fileName = fileName.replace(/_(pdf|docx|doc)$/i,".$1"); var fstream = fs.createWriteStream(path.join(uploadDir, fileName)); file.on('end', function () { console.log(fieldname + '(' + fileName + ') EOF'); }); fstream.on('close', function () { console.log(fieldname + '(' + fileName + ') written to disk'); next(); }); console.log(fieldname + '(' + fileName + ') start saving'); file.pipe(fstream); } function convertToHTMLTxt () { var execTxt, execHTML, execPDF; var textDir = 'text'; var htmlDir = 'html'; console.log('Directory: ', uploadDir); fs.readdir(uploadDir, function(err, files) { if (err) { console.log('error reading directory: ', uploadDir); return; } files.forEach(function(fileName) { var 
fileNameHTML = path.join(uploadDir, htmlDir, fileName.replace(/(pdf|docx|doc)$/i,"html")); var fileNamePDF = path.join(uploadDir, fileName); if (fileName.match(/pdf$/i)) { execPDF = exec('pdftohtml -c -s -noframes -nodrm ' + fileNamePDF + ' ' + fileNameHTML, function(error, stdout, stderr) { console.log('stdout: ', stdout); console.log('stderr: ', stderr); if (error !== null) { console.log('exec error: ', error); } }); execPDF.on('close', function (code) { console.log('******** PDF to HTML Conversion complete - exit code ' + code); }); } }) }); 

一旦转换完成,我遍历所有的HTML文件,并做一个MongoDB批量upsert:

  // Reads one converted HTML file as UTF-8 and queues an upsert for it on the
  // bulk operation, matched by user name and document name.
  fs.readFile(fileNameHTML, 'utf8', function (err, HTMLData) {
      if (err) {
          // '\n' (not '/n') so the error detail starts on its own line.
          console.log('error reading file: ', fileNameHTML + '\nerror: ' + err);
          callback(err);
          return;
      }

      bulk.find({ userName: userName, docName: fileName })
          .upsert()
          .updateOne({ userName: userName, docName: fileName, HTMLData: HTMLData });
      // NOTE(review): the original snippet was truncated after the statement above;
      // presumably the success path should also signal completion via `callback` — confirm.
  });