Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# top-most EditorConfig file
root = true

# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
indent_style = space
charset = utf-8
indent_size = 2
e4x = true

[package.json]
indent_style = space
indent_size = 2

3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
.DS_Store
/assets/
/node_modules/

.idea
*.iml
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,38 @@ Selector syntax is CSS-like and currently supports:

Take a look at the examples for more information.

## Options

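* `encoding`: Encoding of the input stream. When omitted, it is read from the XML declaration and falls back to `utf8`.
* `element`: Name of the parent-level tag to iterate over, typically a tag that repeats like the entries of an array. (**Required**)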
* `output`: Stream output format `xml` or `json`. Defaults to `xml`.
* `attributes`: Should tag attributes be returned. Defaults to `true`.
* `collect`: An array of nested, repeating (array-like) tags whose occurrences should be collected while parsing. Defaults to `[]`.
* `preserve`: A map of items to preserve; see below for more details. Format: `{ "${item}": ${whitespace|true} }`

## Getting started
```javascript
const fs = require('fs');
const xmlStream = require('xml-stream');

const input = fs.createReadStream('json.xml');
const parse = new xmlStream({
element: 'media',
attributes: false,
output:'json',
preserve: {},
collect:['id']
});

parse.on('data', function(data) {
console.log(data);
});


input
.pipe(parse);
```

## Element Node

Each of the four node events has a callback with one argument. When parsing,
Expand Down
20 changes: 20 additions & 0 deletions examples/json.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

// Example: pipe `json.xml` through xml-stream and log every <media> element
// as a plain JavaScript object.
const fs = require('fs');
const path = require('path');
const XmlStream = require('../');

// Resolve the fixture relative to this script so the example works no matter
// which directory `node` is started from.
const input = fs.createReadStream(path.join(__dirname, 'json.xml'));
const parse = new XmlStream({
  element: 'media',   // emit data each time a <media> element closes
  attributes: false,  // strip tag attributes from the emitted objects
  output: 'json',     // emit objects instead of XML text
  preserve: {},
  collect: ['id']     // treat <id> as a repeating (array-like) tag
});

parse.on('data', function (data) {
  console.log(data);
});

// Without listeners an 'error' event is thrown as an uncaught exception;
// report read and parse failures explicitly instead.
function reportError(err) {
  console.error(err);
  process.exitCode = 1;
}
input.on('error', reportError);
parse.on('error', reportError);

input.pipe(parse);
27 changes: 27 additions & 0 deletions examples/json.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<root>
<media mediaId="value" lastModified="date" action="add">
<title size="140" type="full" lang="en">Some title</title>
<ids>
<id type="rootId">10000020</id>
<id type="seriesId">10000020</id>
<id type="TMSId">SH017461480000</id>
</ids>
<image type="image/jpg" width="270" height="360" primary="true" category="Banner">
<URI>Some URL</URI>
<caption lang="en">Some title</caption>
</image>
</media>
<media mediaId="p10000020_b_v4_aa" lastModified="2013-06-14T00:00:00Z" action="add">
<title size="141" type="full" lang="en">Some title</title>
<ids>
<id type="rootId">10000020</id>
<id type="seriesId">10000020</id>
<id type="TMSId">SH017461480000</id>
</ids>
<image type="image/jpg" width="540" height="720" primary="true" category="Banner">
<URI>Some URL 2</URI>
<caption lang="en">Some title</caption>
</image>
</media>
</root>
145 changes: 94 additions & 51 deletions lib/xml-stream.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
var events = require('events')
var stream = require('stream')
, expat = require('node-expat')
, FiniteAutomata = require('./finite-automata')
, Iconv = require('iconv').Iconv
, util = require('util')
;

// Retains link to hasOwnProperty.
Expand Down Expand Up @@ -44,9 +45,8 @@ module.exports = XmlStream;
// **XmlStream** is an XML stream filter based on Expat.
// It traverses a given stream and emits events for predefined selectors.
// Event listeners receive selected elements, context, and trace from root.
function XmlStream(stream, encoding) {
events.EventEmitter.call(this);
this._stream = stream;
function XmlStream(options) {
this._stream = this;
this._fa = new FiniteAutomata();
this._lastState = 0;
this._startState = {};
Expand All @@ -57,16 +57,39 @@ function XmlStream(stream, encoding) {
this._preserveWhitespace = 0;
this._preserveAll = false;
this._collect = false;
this.preludeBuffers = [];
this.prelude = '';
this._parser = undefined;

this._options = options || {};
this._element = this._options.element || 'xml';
this._attributes = !!this._options.attributes; // `true` or `false`
this._output = this._options.output || 'xml'; // `xml` or `json`

// Set input stream encoding and create an iconv instance,
// if conversion is required. Default working encoding is UTF-8,
// so iconv is used when input is anything else, but UTF-8.
this._encoding = encoding || null;
this._encoding = this._options.encoding || null;
this._encoder = makeEncoder(this._encoding);

// Setup stream as stream
this.writable = true;

var scope = this;

// Pre init collectors
// TODO change option to auto parse sub collections
this._collections = this._options.collect || [];
this._collections.map(function (selector) {
scope.collect(selector);
return selector;
});

// Pre-register the preserve rules passed through the options.
// The callback reads the map through `scope`: inside a plain function
// callback `this` is not the stream, so `this._preserve` would not resolve.
this._preserve = this._options.preserve || {};
Object.keys(this._preserve).forEach(function (key) {
  scope.preserve(key, scope._preserve[key]);
});

// Start parsing.
process.nextTick(function () {
parse.call(scope);
Expand All @@ -81,14 +104,38 @@ function makeEncoder(encoding) {
return null;
}

// Inherit events.EventEmitter.
XmlStream.super_ = events.EventEmitter;
XmlStream.prototype = Object.create(events.EventEmitter.prototype, {
constructor: {
value: XmlStream,
enumerable: false
}
});
// Inherits Stream
util.inherits(XmlStream, stream.Stream);

// Setup stream
//
// Accepts one chunk of raw input.
//
// When the encoding is already known the chunk goes straight to
// `this.parseChunk`. Otherwise chunks are buffered until the first complete
// tag has arrived, the encoding is taken from the XML declaration (falling
// back to 'utf8'), and everything buffered so far is flushed to the parser.
//
// NOTE(review): `parseChunk` is installed by `parse()`, which the constructor
// schedules with `process.nextTick`; writing before that tick would find it
// undefined — confirm callers never write synchronously after construction.
//
// Returns `true`, i.e. the stream never asks the source to back off.
XmlStream.prototype.write = function (data) {
  if (this._encoding) {
    this.parseChunk(data);
    return true;
  }

  // We can't parse when the encoding is unknown, so we'll look into
  // the XML declaration, if there is one. For this, we need to buffer
  // incoming data until a full tag is received.
  this.preludeBuffers.push(data);
  this.prelude += data.toString();
  if (!/^\s*<[^>]+>/.test(this.prelude)) {
    return true;
  }

  // XML allows the encoding pseudo-attribute to be quoted with either
  // double or single quotes; the back-reference keeps the quotes paired.
  var matches = this.prelude.match(/^\s*<\?xml[^>]+encoding=(["'])(.+?)\1[^>]*\?>/);
  this._encoding = matches ? matches[2] : 'utf8';
  this._encoder = makeEncoder(this._encoding);

  // Release the buffered prelude before flushing so it is not retained
  // for the lifetime of the stream.
  var buffered = this.preludeBuffers;
  this.preludeBuffers = [];
  this.prelude = '';
  for (var i = 0, n = buffered.length; i < n; i++) {
    this.parseChunk(buffered[i]);
  }
  return true;
};

// Signals end of input: performs the final parse call on the Expat parser
// stored in `this.xml`.
//
// Emits `error` with an `Error` object (message and line number, in the same
// format `parseChunk` uses) when the final parse fails; emits `end` otherwise.
XmlStream.prototype.end = function () {
  var parser = this.xml;
  if (!parser.parse('', true)) {
    this.emit('error', new Error(parser.getError() + " in line " + parser.getCurrentLineNumber()));
    return;
  }
  // NOTE(review): the original author suspected this emit causes looping
  // when the stream is piped — unconfirmed.
  this.emit('end');
};

// Adds a listener for the specified event.
//
Expand Down Expand Up @@ -188,7 +235,7 @@ XmlStream.prototype.pause = function() {
}

return this;
}
};

// resume expat
XmlStream.prototype.resume = function() {
Expand All @@ -204,7 +251,7 @@ XmlStream.prototype.resume = function() {
}

return this;
}
};

// Normalizes the selector and returns the new version and its parts.
function normalizeSelector(selector) {
Expand Down Expand Up @@ -259,6 +306,13 @@ function getFinalState(selector) {
return finalState;
}

// Emits the collected object of an element as a `data` event, but only when
// the tag that just closed is the configured parent element
// (`this._element`). Must be invoked with the stream as `this`; `data` is the
// parser frame whose `element` property is emitted.
function emitJson(name, data) {
  var isTargetElement = this._element === name;
  if (!isTargetElement) {
    return;
  }
  this.emit('data', data.element);
}

// Emits XML for element opening tag.
function emitStart(name, attrs) {
this.emit('data', '<' + name);
Expand Down Expand Up @@ -341,7 +395,7 @@ function emitOneElement(element, name, onLeave) {
// The Expat parser is assigned several listeners for this purpose.
function parse() {
var self = this;
var xml = new expat.Parser('utf-8');
var xml = this.xml = new expat.Parser('utf-8');
this._parser = xml;
this._suspended = false;
var stack = [];
Expand Down Expand Up @@ -380,7 +434,7 @@ function parse() {
context: context
};
self._collect = false;
fa.enter(name, [element, context, trace]);
fa.enter(name, [element, context, trace]); // sets _collect from options
if (self._preserveLevel > 0) {
element.$children = [];
}
Expand All @@ -400,7 +454,9 @@ function parse() {
context[name] = element;
}
if (self._bufferLevel === 0 && self._emitData) {
emitStart.call(self, name, element.$);
if (self._output === 'xml') {
emitStart.call(self, name, element.$);
}
}
});

Expand All @@ -410,7 +466,7 @@ function parse() {
// removed from the stack afterwards.
xml.on('endElement', function(name) {
self.emit('endElement', name);
var prev = stack.pop();
var prev = stack.pop(); // aka parent tag
var element = curr.element;
var text = curr.fullText;
var attr = element.$;
Expand All @@ -434,7 +490,13 @@ function parse() {
if (self._bufferLevel > 0 || self._preserveLevel > 0) {
element.$name = name;
}

if (!self._attributes && !isEmpty(curr.element.$) && element.$text) {
val = text;
}

curr.context[name] = val;

if (curr.collect) {
var container = prev.element[name];
container[container.length - 1] = val;
Expand All @@ -445,8 +507,17 @@ function parse() {
if (self._preserveLevel > 0) {
prev.element.$children.push(val);
}

if (!self._attributes && !isEmpty(curr.element.$)) {
delete element.$;
}

if (self._bufferLevel === 0 && self._emitData) {
emitEnd.call(self, name);
if (self._output === 'xml') {
emitEnd.call(self, name);
} else if (self._output === 'json') {
emitJson.call(self, name, curr);
}
}
curr = prev;
this._collect = curr.collect;
Expand All @@ -458,7 +529,7 @@ function parse() {
curr.element.$text = text;
fa.run('state', [curr.element, curr.context, trace]);
if (self._bufferLevel === 0 && self._emitData) {
emitText.call(self, text);
//emitText.call(self, text);
}
if (!self._preserveAll) {
var trimmed = curr.element.$text.trim();
Expand Down Expand Up @@ -516,41 +587,13 @@ function parse() {

// Parse incoming chunk.
// Convert to UTF-8 or emit errors when appropriate.
var parseChunk = function(data) {
this.parseChunk = function(data) {
if (self._encoder) {
data = self._encoder.convert(data);
}
if (!xml.parse(data, false)) {
self.emit('error', new Error(xml.getError()+" in line "+xml.getCurrentLineNumber()));
}
}

// Pass data from stream to parser.
this._stream.on('data', function(data) {
if (self._encoding) {
parseChunk(data);
} else {
// We can't parse when the encoding is unknown, so we'll look into
// the XML declaration, if there is one. For this, we need to buffer
// incoming data until a full tag is received.
preludeBuffers.push(data);
prelude += data.toString();
if (/^\s*<[^>]+>/.test(prelude)) {
var matches = prelude.match(/^\s*<\?xml[^>]+encoding="(.+?)"[^>]*\?>/);
self._encoding = matches ? matches[1] : 'utf8';
self._encoder = makeEncoder(self._encoding);
for (var i = 0, n = preludeBuffers.length; i < n; i++) {
parseChunk(preludeBuffers[i]);
}
}
}
});
};

// End parsing on stream EOF and emit an *end* event ourselves.
this._stream.on('end', function() {
if (!xml.parse('', true)) {
self.emit('error', new Error(xml.getError()+" in line "+xml.getCurrentLineNumber()));
}
self.emit('end');
});
}