Files
xml-streamer/src/parser.ts
2025-07-01 23:21:56 +02:00

391 lines
9.3 KiB
TypeScript

import _ from 'lodash'
import stream from 'stream'
import util from 'util'
import { SaxLtx } from './ltx'
import { ParserState } from './parserState'
/**
 * Fallback values applied to any option the caller leaves undefined.
 */
const defaults = {
  // No resource path: node selection falls back to node-name listeners.
  resourcePath: '',
  // Do not emit an extra event named after the node.
  emitOnNodeName: false,
  // Key under which a node's attributes are stored.
  attrsKey: '$',
  // Key under which a node's text is stored.
  textKey: '_',
  // Child nodes are always collected in arrays.
  explicitArray: true,
  // Whitespace-only text chunks are skipped.
  verbatimText: false,
  // Whitespace runs in text are collapsed and the text is trimmed.
  preserveWhitespace: false,
}
/**
 * Options accepted by the XML parser. Every field is optional; omitted
 * fields fall back to the module-level defaults.
 */
export interface IXmlParserOptions {
  /**
   * Slash-separated path (for example `/items/item`) of the nodes to extract.
   * Parsing output is produced for elements whose path starts with this
   * value. Default: `''` (select nodes through node-name listeners instead).
   */
  resourcePath?: string

  /**
   * When true and `resourcePath` is set, each extracted object is also
   * emitted as an event named after the closing node, in addition to being
   * pushed to the stream. Default: `false`.
   */
  emitOnNodeName?: boolean

  /**
   * Property name under which a node's attributes are stored in its object
   * form. Default: `'$'`.
   */
  attrsKey?: string

  /**
   * Property name under which a node's text content is stored in its object
   * form. Default: `'_'`.
   */
  textKey?: string

  /**
   * When true, child nodes are always stored in arrays, irrespective of how
   * many nodes share the same name. Default: `true`.
   */
  explicitArray?: boolean

  /**
   * When true, text chunks that consist only of whitespace are kept.
   * When false, such chunks are skipped. Default: `false`.
   */
  verbatimText?: boolean

  /**
   * When false, runs of whitespace in collected text are collapsed to a
   * single space and the text is trimmed. When true, text is stored as
   * received. Default: `false`.
   */
  preserveWhitespace?: boolean
}
/**
 * Transform stream that parses XML text and produces plain objects for the
 * nodes selected by `resourcePath` or by node-name listeners.
 *
 * Input can be supplied either by writing/piping into the stream
 * (`_transform` / `_flush`) or by calling `parse()` chunk by chunk.
 */
export class XmlParser extends stream.Transform {
  public parserState: ParserState
  private opts: IXmlParserOptions
  // Typed view of the base stream's internal readable state used below.
  private _readableState: { objectMode: true; buffer: any }
  private parser: SaxLtx
  /** True once the SAX handlers are attached, so they are never attached twice. */
  private eventsRegistered = false

  /**
   * @param opts Parser options; undefined fields fall back to the defaults.
   *             The object passed in is not modified.
   */
  constructor(opts?: IXmlParserOptions) {
    super()
    // Merge into a fresh object so the caller's options are left untouched.
    this.opts = _.defaults({}, opts, defaults)
    this.parserState = new ParserState()
    this.parser = new SaxLtx()
    // The readable side carries parsed objects rather than bytes.
    this._readableState.objectMode = true
  }

  /**
   * Called by the stream machinery when the writable side has finished;
   * feeds a final empty chunk to the parser.
   */
  public _flush(callback: () => void) {
    this.processChunk('')
    callback()
  }

  /**
   * Feeds one chunk of the written input to the parser.
   *
   * Only chunks reported with the `'buffer'` encoding are accepted; anything
   * else is reported as an `unsupported encoding` error through the callback
   * and the chunk is not parsed.
   */
  public _transform(chunk: Buffer | string, encoding: string, callback: (error?: Error | null) => void) {
    if (encoding !== 'buffer') {
      callback(new Error('unsupported encoding'))
      return
    }
    this.processChunk(chunk)
    callback()
  }

  /**
   * Parses one chunk outside of the piping API and hands the objects
   * produced so far to `cb`.
   *
   * An empty chunk signals the end of input: the parser is ended, `'end'` is
   * emitted and all listeners on this stream are removed.
   *
   * @param chunk XML text to parse.
   * @param cb    Called with the last error emitted while parsing the chunk,
   *              or with `null` and the list of objects drained from the
   *              readable buffer.
   */
  public parse(chunk: Buffer | string, cb: (error: Error, data?: Buffer) => void) {
    const parser = this.parser
    const captured: { error?: Error } = {}
    const captureError = (err: Error) => {
      captured.error = err
    }

    this.prepareParser()
    this.on('error', captureError)
    try {
      if (chunk.length === 0) {
        parser.end()
        this.emit('end')
        this.removeAllListeners()
      }
      parser.write(chunk)
    } finally {
      // The capture listener only serves this call; leaving it attached
      // would add one more listener per parse() invocation.
      this.removeListener('error', captureError)
    }

    if (captured.error) {
      return cb(captured.error)
    }

    // NOTE(review): this drains the stream's internal buffer directly and
    // relies on it exposing `length` and `consume()` — confirm against the
    // Node.js versions this package supports.
    const result = []
    while (this._readableState.buffer.length > 0) {
      result.push(this._readableState.buffer.consume())
    }
    return cb(null, result as any)
  }

  /** Feeds a chunk to the SAX parser, preparing the parser first if needed. */
  private processChunk(chunk: string | Buffer) {
    this.prepareParser()
    this.parser.write(chunk)
  }

  /**
   * Performs the work that has to happen before the first element is seen:
   * collects the node names callers listen for and attaches the SAX handlers
   * exactly once.
   */
  private prepareParser() {
    if (!this.parserState.isRootNode) {
      return
    }
    this.checkForInterestedNodeListeners()
    // `isRootNode` stays set until the first start tag arrives, which may be
    // several chunks in; attaching the handlers again would make every SAX
    // event be processed more than once.
    if (!this.eventsRegistered) {
      this.eventsRegistered = true
      registerEvents.call(this)
    }
  }

  /**
   * Records every event name that has a listener on this stream, except the
   * stream's own events, as a node name the caller is interested in.
   * Names already recorded are not added again.
   */
  private checkForInterestedNodeListeners() {
    const ignore = ['end', 'prefinish', 'data', 'error']
    const interested = this.parserState.interestedNodes
    for (const eventName of this.eventNames()) {
      if (typeof eventName !== 'string' || _.includes(ignore, eventName, 0)) {
        continue
      }
      if (!_.includes(interested, eventName, 0)) {
        interested.push(eventName)
      }
    }
  }
}
/**
 * Attaches the SAX event handlers that turn parser events into objects.
 *
 * Must be invoked with `this` bound to the owning parser stream; it reads
 * `this.parser`, `this.parserState` and `this.opts`, and reports results
 * through `this.push` / `this.emit`.
 */
function registerEvents() {
  const scope = this
  const parser: SaxLtx = this.parser
  const state: ParserState = this.parserState
  const resourcePath = this.opts.resourcePath
  const attrsKey = this.opts.attrsKey
  const textKey = this.opts.textKey
  const interestedNodes = state.interestedNodes
  const explicitArray = this.opts.explicitArray
  const verbatimText = this.opts.verbatimText
  const preserveWhitespace = this.opts.preserveWhitespace

  parser.on('startElement', (name, attrs) => {
    if (state.isRootNode) {
      state.isRootNode = false
    }
    state.currentPath = state.currentPath + '/' + name
    checkForResourcePath(name)
    if (state.isPathfound) {
      processStartElement(name, attrs)
    }
  })

  parser.on('endElement', (name) => {
    state.lastEndedNode = name
    const lastIndex = state.currentPath.lastIndexOf('/' + name)
    // The closing tag must match the last path segment; anything left after
    // it means a different element is still open.
    if (state.currentPath.substring(lastIndex + 1).indexOf('/') !== -1) {
      processError.call(scope, `mismatched tag`)
    }
    state.currentPath = state.currentPath.substring(0, lastIndex)
    if (state.isPathfound) {
      processEndElement(name)
    }
    checkForResourcePath(name)
  })

  parser.on('text', (text) => {
    if (state.isPathfound) {
      processText(text)
    }
  })

  parser.on('error', (err) => {
    // Report against the owning stream: event listeners are invoked with
    // `this` set to the emitting SAX parser, which is not the object
    // processError expects.
    processError.call(scope, err)
  })

  /**
   * Creates the object for a newly opened element and links it into
   * `state.object` at the element's path relative to the selected node.
   */
  function processStartElement(name: string, attrs: any) {
    if (!name) {
      return
    }
    const obj: any = {}
    if (attrs && !_.isEmpty(attrs)) {
      obj[attrsKey] = attrs
    }
    let tempObj = state.object
    const path = getRelativePath()
    if (!path) {
      // The element is the selected node itself: only its attributes are
      // recorded, directly on the root object.
      if (attrs && !_.isEmpty(attrs)) {
        state.object[attrsKey] = attrs
      }
      return
    }
    const tokens = path.split('.')
    for (let i = 0; i < tokens.length; i++) {
      if (tempObj[tokens[i]] && !(explicitArray === false && i === tokens.length - 1)) {
        tempObj = tempObj[tokens[i]]
      } else {
        // if explicitArray is true then create each node as array
        // irrespective of how many nodes are there with same name.
        tempObj[tokens[i]] = explicitArray ? [] : obj
        tempObj = tempObj[tokens[i]]
      }
      // For intermediate segments descend into the most recent sibling.
      if (Array.isArray(tempObj) && i !== tokens.length - 1) {
        tempObj = tempObj[tempObj.length - 1]
      }
    }
    if (Array.isArray(tempObj)) {
      tempObj.push(obj)
    }
  }

  /**
   * Publishes the collected object when a selected element closes and
   * resets the collection state for the next one.
   */
  function processEndElement(name: string) {
    if (resourcePath) {
      const index = resourcePath.lastIndexOf('/')
      const rpath = resourcePath.substring(0, index)
      // currentPath has already been shortened by the caller, so it equals
      // the parent of resourcePath exactly when the selected node closed.
      if (rpath === state.currentPath) {
        scope.push(state.object)
        if (scope.opts.emitOnNodeName) {
          scope.emit(name, state.object)
        }
        state.object = {}
      }
    } else {
      if (_.includes(interestedNodes, name, 0)) {
        emitInterestedNode(name)
        if (state.firstFoundNode === name) {
          state.object = {}
          state.firstFoundNode = ''
          state.isPathfound = false
        }
      }
    }
  }

  /**
   * Looks up the object collected for the closing node `name` (relative to
   * the first found node), then emits it under the node's name and pushes it
   * to the stream.
   */
  function emitInterestedNode(name: string) {
    const xpath = state.currentPath.substring(1)
    let pathTokens = xpath.split('/')
    pathTokens.push(name)
    const index = pathTokens.indexOf(state.firstFoundNode)
    pathTokens = _.drop(pathTokens, index + 1)
    let tempObj = state.object
    // tslint:disable-next-line:prefer-for-of
    for (let i = 0; i < pathTokens.length; i++) {
      tempObj = tempObj[pathTokens[i] as any]
    }
    if (Array.isArray(tempObj)) {
      tempObj = tempObj[tempObj.length - 1]
    }
    scope.emit(name, tempObj)
    scope.push(tempObj)
  }

  /**
   * Appends `text` to the text property of `target`.
   * Unless `preserveWhitespace` is set, the accumulated text is normalised
   * after every append: whitespace runs become one space and the ends are
   * trimmed.
   * NOTE(review): because normalisation runs per chunk, whitespace at a
   * chunk boundary is dropped before the next chunk is appended.
   */
  function appendText(target: any, text: string) {
    if (!target[textKey]) {
      target[textKey] = ''
    }
    target[textKey] = target[textKey] + text
    if (!preserveWhitespace) {
      target[textKey] = target[textKey].replace(/\s+/g, ' ').trim()
    }
  }

  /**
   * Stores a text chunk on the object of the element currently open.
   * Empty chunks are ignored, and whitespace-only chunks are ignored unless
   * `verbatimText` is set.
   */
  function processText(text: string) {
    if (!text || (!verbatimText && !/\S/.test(text))) {
      return
    }
    const path = getRelativePath()
    let tempObj = state.object
    if (!path) {
      appendText(state.object, text)
      return
    }
    const tokens = path.split('.')
    for (let i = 0; i < tokens.length; i++) {
      if (tempObj[tokens[i]]) {
        tempObj = tempObj[tokens[i]]
      } else {
        tempObj[tokens[i]] = explicitArray ? [] : {}
        tempObj = tempObj[tokens[i]]
      }
      if (Array.isArray(tempObj) && i !== tokens.length - 1) {
        tempObj = tempObj[tempObj.length - 1]
      }
    }
    if (Array.isArray(tempObj)) {
      // Text belongs to the most recently opened sibling.
      appendText(tempObj[tempObj.length - 1], text)
    } else {
      appendText(tempObj, text)
    }
  }

  /**
   * Updates `state.isPathfound` for the current position: with a
   * `resourcePath` it reflects whether the current path starts with it;
   * otherwise it is switched on when `name` is a node someone listens for.
   */
  function checkForResourcePath(name: string) {
    if (resourcePath) {
      if (state.currentPath.indexOf(resourcePath) === 0) {
        state.isPathfound = true
      } else {
        state.isPathfound = false
      }
    } else {
      if (_.includes(interestedNodes, name, 0)) {
        state.isPathfound = true
        if (!state.firstFoundNode) {
          state.firstFoundNode = name
        }
      }
    }
  }

  /**
   * Returns the dot-separated path of the current element relative to the
   * selected node (the `resourcePath` node or the first found node).
   * Returns `undefined` or an empty string when the current element is the
   * selected node itself.
   */
  function getRelativePath() {
    let tokens
    let jsonPath
    if (resourcePath) {
      let xpath = state.currentPath.substring(resourcePath.length)
      if (!xpath) {
        return
      }
      if (xpath[0] === '/') {
        xpath = xpath.substring(1)
      }
      tokens = xpath.split('/')
      jsonPath = tokens.join('.')
    } else {
      const xpath = state.currentPath.substring(1)
      tokens = xpath.split('/')
      const index = tokens.indexOf(state.firstFoundNode)
      tokens = _.drop(tokens, index + 1)
      jsonPath = tokens.join('.')
    }
    return jsonPath
  }
}
/**
 * Wraps a parsing failure in an Error that carries the parser's current
 * line number, emits it as an `'error'` event on `this`, and returns it.
 *
 * Must be invoked with `this` bound to an object exposing `parser` and
 * `emit`.
 *
 * @param err The failure to report; when falsy, the parser's own error
 *            (`parser.getError()`) is used instead.
 * @returns The wrapped Error that was emitted.
 */
function processError(err: Error) {
  const saxParser = this.parser
  const cause = err ? err : saxParser.getError()
  const wrapped: Error = new Error(`${cause} at line no: ${saxParser.getCurrentLineNumber()}`)
  this.emit('error', wrapped)
  return wrapped
}