-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Speed up JSON parsing with simdjson? #51596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Comments
I think it is an interesting thing to consider though we should be aware that it most likely not going to be an easy and simple win if we just try to shove The current
A straightforward way to use My main concern here would be:
For the sake of a completeness here is a very naive benchmark, I have done: $ dart compile exe bin/json_benchmark.dart
$ bin/json_benchmark.exe
jsonDecode(from string): 119.26 (10.740) 110..130 p90: 125
jsonDecode(from bytes): 117.64 (9.360) 108..127 p90: 123
json reader -> empty processor: 69.38 (3.620) 68..73 p90: 70
json reader with manual parse: 21.92 (0.080) 21..22 p90: 22
$ clang++ -O3 -std=c++20 -o parse parse.cpp simdjson.cpp
$ ./parse
simdjson: 4.41 (0.859) 4..13 p90: 5 So import 'dart:convert';
import 'dart:io';
import 'dart:math' as math;
import 'dart:typed_data';
import 'package:jsontool/jsontool.dart';
void measure(String name, void Function() cb) {
const nRuns = 100;
final timings = Int64List(nRuns);
for (var i = 0; i < nRuns; i++) {
final sw = Stopwatch()..start();
cb();
final elapsed = sw.elapsedMilliseconds;
timings[i] = elapsed;
}
timings.sort();
final sum = timings.fold(0, (sum, v) => sum + v);
final mean = sum / nRuns;
final min = timings.first;
final max = timings.last;
final stdDev = math.sqrt(timings.fold(0.0, (sum, v) {
final x = v - mean;
return x * x;
}));
print(
'$name: $mean (${stdDev.toStringAsFixed(3)}) $min..$max p90: ${timings[90]}');
}
class MyProcessor extends JsonProcessor {}
void main(List<String> arguments) {
// curl -o data.json https://conda.anaconda.org/conda-forge/noarch/current_repodata.json
final fileAsString = File('data.json').readAsStringSync();
final fileAsBytes = File('data.json').readAsBytesSync();
measure('jsonDecode(from string)', () => jsonDecode(fileAsString));
measure('jsonDecode(from bytes)',
() => utf8.decoder.fuse(json.decoder).convert(fileAsBytes));
measure('json reader -> empty processor',
() => MyProcessor().processValue(JsonReader.fromUtf8(fileAsBytes)));
measure('json reader with manual parse', () {
int cnt = 0;
final reader = JsonReader.fromUtf8(fileAsBytes);
reader.expectObject();
while (reader.hasNextKey()) {
final prop = reader.nextKey()!;
if (prop == 'packages') {
reader.expectObject();
while (reader.skipObjectEntry() && reader.hasNext()) {
cnt++;
}
break;
} else {
reader.skipAnyValue();
}
}
if (cnt != 23004) {
throw 'counted $cnt packages, but expected 23004';
}
});
} // $ curl -o data.json https://conda.anaconda.org/conda-forge/noarch/current_repodata.json
// $ clang++ -O3 -std=c++20 -o parse parse.cpp simdjson.cpp
// $ ./parse
#include <algorithm>
#include <array>
#include <iostream>
#include <numeric>
#include "simdjson.h"
using namespace simdjson;
template <typename F>
void measure(std::string_view name, F &&f) {
static constexpr int NRuns = 100;
std::array<int64_t, NRuns> timings;
for (intptr_t run = 0; run < NRuns; run++) {
const auto start = std::chrono::steady_clock::now();
f();
const auto end = std::chrono::steady_clock::now();
timings[run] =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
}
std::sort(timings.begin(), timings.end());
const int64_t sum = std::accumulate(timings.begin(), timings.end(), 0);
const double mean = static_cast<double>(sum) / NRuns;
const int64_t median = timings[timings.size() / 2];
const int64_t min = timings[0];
const int64_t max = timings[NRuns - 1];
const double stddev =
sqrt(std::accumulate(timings.begin(), timings.end(), 0.0,
[&](double sum, int64_t v) -> double {
double x = static_cast<double>(v) - mean;
return x * x;
}) /
NRuns);
std::cout << name << ": " << mean << " (" << stddev << ")"
<< " " << min << ".." << max << " p90: " << timings[90]
<< std::endl;
}
int main(void) {
padded_string json = padded_string::load("data.json");
measure("simdjson", [&]() {
ondemand::parser parser;
ondemand::document data = parser.iterate(json);
int count = 0;
for (auto el : data["packages"].get_object()) {
count++;
}
if (count != 23004) {
std::cerr << "got " << count << " packages, but expected 23004"
<< std::endl;
exit(1);
}
});
return 0;
} FWIW I think a more interesting thing to consider is fusing JSON deserialization and actual creation of data model from JSON. e.g. if instead of giving Something like a plugin for (I am leaving aside the fact that |
Would it make sense for us to replace our JSON parser with simdjson.org's parser?
The text was updated successfully, but these errors were encountered: