From cf1148b879e498ecca8c9943f1574d034675db9e Mon Sep 17 00:00:00 2001 From: Sai1919 Date: Mon, 7 Nov 2016 01:54:36 +0530 Subject: [PATCH] first commit: add entire code base --- package.json | 40 ++ parser.js | 155 +++++ parserState.js | 12 + test/TestFiles/corrupted.xml | 11 + test/TestFiles/item.xml | 12 + test/TestFiles/manyItems.xml | 1195 ++++++++++++++++++++++++++++++++++ test/TestFiles/medium.xml | 44 ++ test/test.js | 84 +++ 8 files changed, 1553 insertions(+) create mode 100644 package.json create mode 100644 parser.js create mode 100644 parserState.js create mode 100644 test/TestFiles/corrupted.xml create mode 100644 test/TestFiles/item.xml create mode 100644 test/TestFiles/manyItems.xml create mode 100644 test/TestFiles/medium.xml create mode 100644 test/test.js diff --git a/package.json b/package.json new file mode 100644 index 0000000..819a45b --- /dev/null +++ b/package.json @@ -0,0 +1,40 @@ +{ + "version": "0.0.1", + "name": "xml-streamer", + "description": "XML stream parser for parsing large files efficiently with less usage of memory.", + "author": { + "name": "Sai Teja", + "email": "saitejas464@gmail.com" + }, + "keywords": [ + "xml", + "xml streaming", + "xml streamer", + "streaming", + "xml parser", + "xml parsing", + "xml2js", + "xmltojs" + ], + "license": "MIT", + "repository": { + "type": "git", + "url": "https://github.com/Sai1919/xml-streamer" + }, + "dependencies": { + "node-expat": "2.3.15", + "lodash": "4.16.6" + }, + "devDependencies": { + "mocha": "3.1.2", + "should": "11.1.1" + }, + "optionalDependencies": {}, + "main": "./parser", + "maintainers": [ + { + "name": "Sai Teja", + "email": "saitejas464@gmail.com" + } + ] +} diff --git a/parser.js b/parser.js new file mode 100644 index 0000000..3281839 --- /dev/null +++ b/parser.js @@ -0,0 +1,155 @@ +var expat = require('node-expat') +var _ = require('lodash') +var ParserState = require('./parserState') + +function XmlParser (xmlStream, opts) { + this.opts = opts || {} + this.parserState = new ParserState() + this.parser = new expat.Parser('UTF-8') + var scope = this + this.parser.pause = function () { + xmlStream.pause() + scope.parser.stop() + } + this.parser.restart = function () { + scope.parser.resume() + xmlStream.resume() + } + process.nextTick(function () { + parse.call(scope, xmlStream) + }) + return this.parser +} + +function parse (xmlStream) { + if (!this.opts.resourcePath) this.parser.emit('error', new Error('resourcePath missing')) + var scope = this + var parser = scope.parser + var state = this.parserState + var lastIndex + var resourcePath = this.opts.resourcePath + + parser.on('startElement', function (name, attrs) { + if (state.isRootNode) validateResourcePath(name) + state.currentPath = state.currentPath + '/' + name + checkForResourcePath(name) + if (state.isPathfound) processStartElement(name, attrs) + }) + + parser.on('endElement', function (name) { + state.lastEndedNode = name + lastIndex = state.currentPath.lastIndexOf('/' + name) + state.currentPath = state.currentPath.substring(0, lastIndex) + if (state.isPathfound) processEndElement(name) + checkForResourcePath(name) + }) + + parser.on('text', function (text) { + if (state.isPathfound) processText(text) + }) + + parser.on('end', function () { + parser.emit('finish') + }) + + function processStartElement (name, attrs) { + if (!name) return + var obj = {} + if (attrs && !_.isEmpty(attrs)) obj.$ = attrs + var tempObj = state.object + var path = getRelativePath(name) + if (!path) { + if (attrs && !_.isEmpty(attrs)) state.object.$ = attrs + return + } + var tokens = path.split('.') + + for (var i = 0; i < tokens.length; i++) { + if (tempObj[tokens[i]]) { + tempObj = tempObj[tokens[i]] + } else { + tempObj[tokens[i]] = [] + tempObj = tempObj[tokens[i]] + } + if (Array.isArray(tempObj) && i !== tokens.length - 1) tempObj = tempObj[tempObj.length - 1] + } + tempObj.push(obj) + } + + function processEndElement (name) { + var index = resourcePath.lastIndexOf('/') + var rpath = resourcePath.substring(0, index) + + if (rpath === state.currentPath) { + if (scope.opts.emitEventsOnNodeName) parser.emit(name, state.object) + parser.emit('data', state.object) + state.object = {} + } + } + + function processText (text) { + if (!text || !/\S/.test(text)) { + return + } + var path = getRelativePath() + var tempObj = state.object + if (!path) { + if (!state.object._) state.object._ = '' + state.object._ = state.object._ + text + return + } + var tokens = path.split('.') + for (var i = 0; i < tokens.length; i++) { + if (tempObj[tokens[i]]) { + tempObj = tempObj[tokens[i]] + } else { + tempObj[tokens[i]] = [] + tempObj = tempObj[tokens[i]] + } + if (Array.isArray(tempObj) && i !== tokens.length - 1) tempObj = tempObj[tempObj.length - 1] + } + var obj = tempObj[tempObj.length - 1] + if (!obj._) obj._ = '' + obj._ = obj._ + text + } + + function checkForResourcePath (name) { + if (state.currentPath.indexOf(resourcePath) === 0) { + state.isPathfound = true + } else { + state.isPathfound = false + } + } + + function getRelativePath () { + var xpath = state.currentPath.substring(resourcePath.length) + + if (!xpath) return + if (xpath[0] === '/') xpath = xpath.substring(1) + var tokens = xpath.split('/') + var jsonPath = tokens.join('.') + return jsonPath + } + + function validateResourcePath (name) { + var temp + var index + + state.isRootNode = false + + if (resourcePath[0] === '/') { + temp = resourcePath.substring(1, resourcePath.length) + } else { + temp = resourcePath + } + index = temp.indexOf('/') + temp = temp.substring(0, index) + + if (temp !== name) { + xmlStream.end() + } + } +} + +module.exports = XmlParser + diff --git a/parserState.js b/parserState.js new file mode 100644 index 0000000..5cd45b6 --- /dev/null +++ b/parserState.js @@ -0,0 +1,12 @@ + +function ParserState () { + this.currentPath = '' + this.lastEndedNode = '' + this.isPathfound = false + this.object = {} + this.buffer = [] + this.paused = false + this.isRootNode = true +} + +module.exports = ParserState diff --git a/test/TestFiles/corrupted.xml b/test/TestFiles/corrupted.xml new file mode 100644 index 0000000..88dfb2f --- /dev/null +++ b/test/TestFiles/corrupted.xml @@ -0,0 +1,11 @@ + + + + one + two + + three + four + five + + diff --git a/test/TestFiles/item.xml b/test/TestFiles/item.xml new file mode 100644 index 0000000..0e95f17 --- /dev/null +++ b/test/TestFiles/item.xml @@ -0,0 +1,12 @@ + + + + one + two + + + three + four + five + + diff --git a/test/TestFiles/manyItems.xml b/test/TestFiles/manyItems.xml new file mode 100644 index 0000000..e4e09a2 --- /dev/null +++ b/test/TestFiles/manyItems.xml @@ -0,0 +1,1195 @@ + + + + one + two + + + three + four + five + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + three + four + five + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + three + four + five + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + three + four + five + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + three + four + five + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + three + four + five + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + three + four + five + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + three + four + five + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + \ No newline at end of file diff --git a/test/TestFiles/medium.xml b/test/TestFiles/medium.xml new file mode 100644 index 0000000..e3eb5a8 --- /dev/null +++ b/test/TestFiles/medium.xml @@ -0,0 +1,44 @@ + + + + one + two + + + three + four + five + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + + one + two + + \ No newline at end of file diff --git a/test/test.js b/test/test.js new file mode 100644 index 0000000..986c9b8 --- /dev/null +++ b/test/test.js @@ -0,0 +1,84 @@ +var should = require('should') +var fs = require('fs') + +var ParserFactory = require('../parser') + +describe('Tests', function () { + describe('simple behaviour testing', function () { + it('should properly parse a simple file.', function (done) { + var xmlStream = fs.createReadStream('./test/TestFiles/item.xml') + var parser = new ParserFactory(xmlStream, {resourcePath: '/items/item'}) + var expectedData = [ + { '$': { id: '1', test: 'hello' }, + subitem: + [ { '$': { sub: 'TESTING SUB' }, _: 'one' }, + { '$': { sub: '2' }, _: 'two' } ] }, + { '$': { id: '2' }, + subitem: [ { _: 'three' }, { _: 'four' }, { _: 'five' } ] } ] + var actualData = [] + var dataEventCount = 0 + + parser.on('data', function (data) { + actualData.push(data) + dataEventCount++ + }) + + parser.on('error', function (err) { + done(err) + }) + + parser.on('end', function () { + // console.log('actualData=', actualData) + // console.log('dataEventCount=', dataEventCount) + actualData.should.deepEqual(expectedData) + dataEventCount.should.equal(2) + done() + }) + xmlStream.pipe(parser) + }) + + it('should properly parse a medium size file.', function (done) { + var xmlStream = fs.createReadStream('./test/TestFiles/medium.xml') + var parser = new ParserFactory(xmlStream, {resourcePath: '/items/item'}) + + var dataEventCount = 0 + + parser.on('data', function (data) { + dataEventCount++ + }) + + parser.on('error', function (err) { + done(err) + }) + + parser.on('end', function () { + // console.log('dataEventCount=', dataEventCount) + dataEventCount.should.equal(10) + done() + }) + xmlStream.pipe(parser) + }) + + it('should properly parse a file containing many nodes.', function (done) { + var xmlStream = fs.createReadStream('./test/TestFiles/manyItems.xml') + var parser = new ParserFactory(xmlStream, {resourcePath: '/items/item'}) + + var dataEventCount = 0 + + parser.on('data', function (data) { + dataEventCount++ + }) + + parser.on('error', function (err) { + done(err) + }) + + parser.on('end', function () { + // console.log('dataEventCount=', dataEventCount) + dataEventCount.should.equal(296) + done() + }) + xmlStream.pipe(parser) + }) + }) +})