diff --git a/package.json b/package.json
new file mode 100644
index 0000000..819a45b
--- /dev/null
+++ b/package.json
@@ -0,0 +1,40 @@
+{
+ "version": "0.0.1",
+ "name": "xml-streamer",
+ "description": "XML stream parser for parsing large files efficiently with less usage of memory.",
+ "author": {
+ "name": "Sai Teja",
+ "email": "saitejas464@gmail.com"
+ },
+ "keywords": [
+ "xml",
+ "xml streaming",
+ "xml streamer",
+ "streaming",
+ "xml parser",
+ "xml parsing",
+ "xml2js",
+ "xmltojs"
+ ],
+ "license": "MIT",
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/Sai1919/xml-streamer"
+ },
+ "dependencies": {
+ "node-expat": "2.3.15",
+ "lodash": "4.16.6"
+ },
+ "devDependencies": {
+ "mocha": "3.1.2",
+ "should": "11.1.1"
+ },
+ "optionalDependencies": {},
+ "main": "./parser",
+ "maintainers": [
+ {
+ "name": "Sai Teja",
+ "email": "saitejas464@gmail.com"
+ }
+ ]
+}
diff --git a/parser.js b/parser.js
new file mode 100644
index 0000000..3281839
--- /dev/null
+++ b/parser.js
@@ -0,0 +1,155 @@
+var expat = require('node-expat')
+var _ = require('lodash')
+var ParserState = require('./parserState')
+
+function XmlParser (xmlStream, opts) {
+ this.opts = opts || {}
+ this.parserState = new ParserState()
+ this.parser = new expat.Parser('UTF-8')
+ var scope = this
+ this.parser.pause = function () {
+ xmlStream.pause()
+ scope.parser.stop()
+ }
+ this.parser.restart = function () {
+ scope.parser.resume()
+ xmlStream.resume()
+ }
+ process.nextTick(function () {
+ parse.call(scope, xmlStream)
+ })
+ return this.parser
+}
+
+function parse (xmlStream) {
+ if (!this.opts.resourcePath) this.parser.emit('error', new Error('resourcePath missing'))
+ var scope = this
+ var parser = scope.parser
+ var state = this.parserState
+ var lastIndex
+ var resourcePath = this.opts.resourcePath
+
+ parser.on('startElement', function (name, attrs) {
+ if (state.isRootNode) validateResourcePath(name)
+ state.currentPath = state.currentPath + '/' + name
+ checkForResourcePath(name)
+ if (state.isPathfound) processStartElement(name, attrs)
+ })
+
+ parser.on('endElement', function (name) {
+ state.lastEndedNode = name
+ lastIndex = state.currentPath.lastIndexOf('/' + name)
+ state.currentPath = state.currentPath.substring(0, lastIndex)
+ if (state.isPathfound) processEndElement(name)
+ checkForResourcePath(name)
+ })
+
+ parser.on('text', function (text) {
+ if (state.isPathfound) processText(text)
+ })
+
+ parser.on('end', function () {
+ parser.emit('finish')
+ })
+
+ function processStartElement (name, attrs) {
+ if (!name) return
+ var obj = {}
+ if (attrs && !_.isEmpty(attrs)) obj.$ = attrs
+ var tempObj = state.object
+ var path = getRelativePath(name)
+ if (!path) {
+ if (attrs && !_.isEmpty(attrs)) state.object.$ = attrs
+ return
+ }
+ var tokens = path.split('.')
+
+ for (var i = 0; i < tokens.length; i++) {
+ if (tempObj[tokens[i]]) {
+ tempObj = tempObj[tokens[i]]
+ } else {
+ tempObj[tokens[i]] = []
+ tempObj = tempObj[tokens[i]]
+ }
+ if (Array.isArray(tempObj) && i !== tokens.length - 1) tempObj = tempObj[tempObj.length - 1]
+ }
+ tempObj.push(obj)
+ }
+
+ function processEndElement (name) {
+ var index = resourcePath.lastIndexOf('/')
+ var rpath = resourcePath.substring(0, index)
+
+ if (rpath === state.currentPath) {
+ if (scope.opts.emitEventsOnNodeName) parser.emit(name, state.object)
+ parser.emit('data', state.object)
+ state.object = {}
+ }
+ }
+
+ function processText (text) {
+ if (!text || !/\S/.test(text)) {
+ return
+ }
+ var path = getRelativePath()
+ var tempObj = state.object
+ if (!path) {
+ if (!state.object._) state.object._ = ''
+ state.object._ = state.object._ + text
+ return
+ }
+ var tokens = path.split('.')
+ for (var i = 0; i < tokens.length; i++) {
+ if (tempObj[tokens[i]]) {
+ tempObj = tempObj[tokens[i]]
+ } else {
+ tempObj[tokens[i]] = []
+ tempObj = tempObj[tokens[i]]
+ }
+ if (Array.isArray(tempObj) && i !== tokens.length - 1) tempObj = tempObj[tempObj.length - 1]
+ }
+ var obj = tempObj[tempObj.length - 1]
+ if (!obj._) obj._ = ''
+ obj._ = obj._ + text
+ }
+
+ function checkForResourcePath (name) {
+ if (state.currentPath.indexOf(resourcePath) === 0) {
+ state.isPathfound = true
+ } else {
+ state.isPathfound = false
+ }
+ }
+
+ function getRelativePath () {
+ var xpath = state.currentPath.substring(resourcePath.length)
+
+ if (!xpath) return
+ if (xpath[0] === '/') xpath = xpath.substring(1)
+ var tokens = xpath.split('/')
+ var jsonPath = tokens.join('.')
+ return jsonPath
+ }
+
+ function validateResourcePath (name) {
+ var temp
+ var index
+
+ state.isRootNode = false
+
+ if (resourcePath[0] === '/') {
+ temp = resourcePath.substring(1, resourcePath.length)
+ } else {
+ temp = resourcePath
+ }
+ index = temp.indexOf('/')
+ temp = temp.substring(0, index)
+
+ if (temp !== name) {
+ xmlStream.end()
+ }
+ }
+}
+
+// The module's export is the XmlParser constructor itself.
+module.exports = XmlParser
+
diff --git a/parserState.js b/parserState.js
new file mode 100644
index 0000000..5cd45b6
--- /dev/null
+++ b/parserState.js
@@ -0,0 +1,12 @@
+
+function ParserState () {
+ this.currentPath = ''
+ this.lastEndedNode = ''
+ this.isPathfound = false
+ this.object = {}
+ this.buffer = []
+ this.paused = false
+ this.isRootNode = true
+}
+
+// The module's export is the ParserState constructor itself.
+module.exports = ParserState
diff --git a/test/TestFiles/corrupted.xml b/test/TestFiles/corrupted.xml
new file mode 100644
index 0000000..88dfb2f
--- /dev/null
+++ b/test/TestFiles/corrupted.xml
@@ -0,0 +1,11 @@
+
+
+-
+ one
+ two
+
-
+ three
+ four
+ five
+
+
diff --git a/test/TestFiles/item.xml b/test/TestFiles/item.xml
new file mode 100644
index 0000000..0e95f17
--- /dev/null
+++ b/test/TestFiles/item.xml
@@ -0,0 +1,12 @@
+
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+
diff --git a/test/TestFiles/manyItems.xml b/test/TestFiles/manyItems.xml
new file mode 100644
index 0000000..e4e09a2
--- /dev/null
+++ b/test/TestFiles/manyItems.xml
@@ -0,0 +1,1195 @@
+
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+
\ No newline at end of file
diff --git a/test/TestFiles/medium.xml b/test/TestFiles/medium.xml
new file mode 100644
index 0000000..e3eb5a8
--- /dev/null
+++ b/test/TestFiles/medium.xml
@@ -0,0 +1,44 @@
+
+
+-
+ one
+ two
+
+-
+ three
+ four
+ five
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+-
+ one
+ two
+
+
\ No newline at end of file
diff --git a/test/test.js b/test/test.js
new file mode 100644
index 0000000..986c9b8
--- /dev/null
+++ b/test/test.js
@@ -0,0 +1,84 @@
+var should = require('should')
+var fs = require('fs')
+
+var ParserFactory = require('../parser')
+
+describe('Tests', function () {
+ describe('simple behaviour testing', function () {
+ it('should properly parse a simple file.', function (done) {
+ var xmlStream = fs.createReadStream('./test/TestFiles/item.xml')
+ var parser = new ParserFactory(xmlStream, {resourcePath: '/items/item'})
+ var expectedData = [
+ { '$': { id: '1', test: 'hello' },
+ subitem:
+ [ { '$': { sub: 'TESTING SUB' }, _: 'one' },
+ { '$': { sub: '2' }, _: 'two' } ] },
+ { '$': { id: '2' },
+ subitem: [ { _: 'three' }, { _: 'four' }, { _: 'five' } ] } ]
+ var actualData = []
+ var dataEventCount = 0
+
+ parser.on('data', function (data) {
+ actualData.push(data)
+ dataEventCount++
+ })
+
+ parser.on('error', function (err) {
+ done(err)
+ })
+
+ parser.on('end', function () {
+ // console.log('actualData=', actualData)
+ // console.log('dataEventCount=', dataEventCount)
+ actualData.should.deepEqual(expectedData)
+ dataEventCount.should.equal(2)
+ done()
+ })
+ xmlStream.pipe(parser)
+ })
+
+ it('should properly parse a medium size file.', function (done) {
+ var xmlStream = fs.createReadStream('./test/TestFiles/medium.xml')
+ var parser = new ParserFactory(xmlStream, {resourcePath: '/items/item'})
+
+ var dataEventCount = 0
+
+ parser.on('data', function (data) {
+ dataEventCount++
+ })
+
+ parser.on('error', function (err) {
+ done(err)
+ })
+
+ parser.on('end', function () {
+ // console.log('dataEventCount=', dataEventCount)
+ dataEventCount.should.equal(10)
+ done()
+ })
+ xmlStream.pipe(parser)
+ })
+
+ it('should properly parse a file containing many nodes.', function (done) {
+ var xmlStream = fs.createReadStream('./test/TestFiles/manyItems.xml')
+ var parser = new ParserFactory(xmlStream, {resourcePath: '/items/item'})
+
+ var dataEventCount = 0
+
+ parser.on('data', function (data) {
+ dataEventCount++
+ })
+
+ parser.on('error', function (err) {
+ done(err)
+ })
+
+ parser.on('end', function () {
+ // console.log('dataEventCount=', dataEventCount)
+ dataEventCount.should.equal(296)
+ done()
+ })
+ xmlStream.pipe(parser)
+ })
+ })
+})