在 Node.js 中解析本地 PDF 文件

我已经在一个 MEAN 技术栈的 Web 应用中使用 fs 上传了 PDF 文件。我想从 PDF 中提取某些字段,并显示在 Web 应用中。我已经看过几个 npm 包,例如 pdf.js 和 pdf2json,但我看不懂它们的文档,也不明白现有示例中使用的 JavaScript 回调(callback)。请帮忙!

希望我能帮你解答这个问题。可以使用 pdf2json 来解析 PDF 并提取文本。需要完成以下几个步骤。我对 https://github.com/modesty/pdf2json 中的示例做了一些调整。

准备工作是在你的 Node 应用中安装 pdf2json 以及 underscore。示例页面没有说明需要自己定义回调(callback)函数,而且在注册这些回调时使用的是 self 而不是 this。因此,经过适当修改后,从 PDF 中提取所有文本的代码如下所示:

 // Dependencies, already installed into ./node_modules with
 // `npm install <dep>` from the root directory of your app.
 var _ = require('underscore'),
     PDFParser = require('pdf2json');

 var pdfParser = new PDFParser();

 // Handle the pdf once it has been parsed. Here we walk every page,
 // take each text block and print its first text run to the console.
 // Run `console.log(JSON.stringify(pdf))` to see how the parsed pdf is
 // composed, then drill down into it to find the data you are looking for.
 // NOTE(review): some pdf2json versions URI-encode `T`; if the output shows
 // sequences such as `%20`, wrap it in decodeURIComponent() - confirm
 // against the installed version.
 var _onPDFBinDataReady = function (pdf) {
   console.log('Loaded pdf:\n');

   // Pages and Texts are arrays, so iterate them with forEach rather than
   // for...in (which walks enumerable keys, not elements). The `|| []`
   // fallback skips a missing list silently.
   (pdf.data.Pages || []).forEach(function (page) {
     (page.Texts || []).forEach(function (text) {
       // Skip text blocks that carry no text runs instead of throwing.
       if (text.R && text.R.length > 0) {
         console.log(text.R[0].T);
       }
     });
   });
 };

 // Error handler: report parser failures on stderr.
 var _onPDFBinDataError = function (error) {
   console.error(error);
 };

 // Register the handlers so they are called when the parser emits its
 // events. Unlike the upstream example this binds `this` rather than
 // `self`, since `self` has no meaning in this context. The handlers do
 // not read `this`, so the binding is optional.
 pdfParser.on('pdfParser_dataReady', _.bind(_onPDFBinDataReady, this));
 pdfParser.on('pdfParser_dataError', _.bind(_onPDFBinDataError, this));

 // Path of the pdf to parse.
 var pdfFilePath = 'test3.pdf';

 // Load the pdf. Once it is loaded the data-ready handler is called.
 pdfParser.loadPDF(pdfFilePath);

我是在服务器端控制器中运行这段代码的:

 module.exports = (function() { return { add: function(req, res) { var tmp_path = req.files.pdf.path; var target_path = './uploads/' + req.files.pdf.name; fs.rename(tmp_path, target_path, function(err) { if (err) throw err; // delete the temporary file, so that the explicitly set temporary upload dir does not get filled with unwanted files fs.unlink(tmp_path, function() { if (err) throw err; //edit here pdf parser res.redirect('#/'); }); }) }, show: function(req, res) { var pdfParser = new PDFParser(); var _onPDFBinDataReady = function (pdf) { console.log('Loaded pdf:\n'); for (var i in pdf.data.Pages) { var page = pdf.data.Pages[i]; // console.log(page.Texts); for (var j in page.Texts) { var text = page.Texts[j]; // console.log(text.R[0].T); } } console.log(JSON.stringify(pdf)); }; // Create an error handling function var _onPDFBinDataError = function (error) { console.log(error); }; pdfParser.on('pdfParser_dataReady', _.bind(_onPDFBinDataReady, this)); // Register error handling function pdfParser.on('pdfParser_dataError', _.bind(_onPDFBinDataError, this)); // Construct the file path of the pdf var pdfFilePath = './uploads/Invoice_template.pdf'; // Load the pdf. When it is loaded your data ready function will be called. pdfParser.loadPDF(pdfFilePath); }, //end controller 

}