diff --git a/benchmark/REQUIRE b/benchmark/REQUIRE new file mode 100644 index 00000000..1909c590 --- /dev/null +++ b/benchmark/REQUIRE @@ -0,0 +1 @@ +DataTables diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl new file mode 100644 index 00000000..73c5fa00 --- /dev/null +++ b/benchmark/benchmarks.jl @@ -0,0 +1,22 @@ +using PkgBenchmark +using Query +using DataTables + +@benchgroup "Query" begin + N = 100_000_000; + A = rand(N); + B = rand(1:100, N); + dt = DataTable([A, B], [:A, :B]); + + @bench "group" @from i in $dt begin + @group i.A by i.B into g + @select {m = mean(g)} + @collect DataTable + end + + @bench "group2" @from i in $dt begin + @group i.A by i.B into g + @select {m = mean(g)} + @collect DataTable + end +end diff --git a/benchmark/perf.jl b/benchmark/perf.jl new file mode 100644 index 00000000..068577ce --- /dev/null +++ b/benchmark/perf.jl @@ -0,0 +1,32 @@ +using DataTables, Query + +N = 100_000_000; +A = rand(N); +B = rand(1:100, N); +dt = DataTable([A, B], [:A, :B]); +dt = DataTable(A = NullableArray(A), B = NullableArray(B)); + +@time by(dt, :B, d -> mean(d[:A])); + +@time x = @from i in dt begin + @group i.A by i.B into g + @select {m = mean(g)} + @collect DataTable +end; + +function foo1(dt) + by(dt, :B, d -> mean(d[:A])) +end + +function foo2(dt) + x = @from i in dt begin + @group i.A by i.B into g + @select {m = mean(g)} + @collect DataTable + end +end + +@time foo1(dt); +@time foo2(dt); + +@profile foo2(dt); diff --git a/src/enumerable/enumerable_groupby.jl b/src/enumerable/enumerable_groupby.jl index b1acb600..aa0cde41 100644 --- a/src/enumerable/enumerable_groupby.jl +++ b/src/enumerable/enumerable_groupby.jl @@ -9,7 +9,8 @@ immutable Grouping{TKey,T} <: AbstractArray{T,1} end import Base.size -size{TKey,T}(A::Grouping{TKey,T}) = size(A.elements) +size{TKey,T}(A::Grouping{TKey,T}) = (length(A.elements),) +Base.IndexStyle(::Type{<:Grouping}) = IndexLinear() import Base.getindex getindex{TKey,T}(A::Grouping{TKey,T},i) = A.elements[i] import Base.length @@ -63,6 +64,10 @@ immutable EnumerableGroupBy{T,TKey,TR,SO,ES<:Function,RS<:Function} <: Enumerabl resultSelector::RS end +IterableTables.iteratorsize2(::Type{<:EnumerableGroupBy}) = IterableTables.HasLengthAfterStart() + +Base.length(iter::EnumerableGroupBy, state) = length(state[1]) + Base.eltype{T,TKey,TR,SO,ES}(iter::EnumerableGroupBy{T,TKey,TR,SO,ES}) = T Base.eltype{T,TKey,TR,SO,ES}(iter::Type{EnumerableGroupBy{T,TKey,TR,SO,ES}}) = T @@ -85,25 +90,33 @@ end # TODO This should be rewritten as a lazy iterator function start{T,TKey,TR,SO,ES}(iter::EnumerableGroupBy{T,TKey,TR,SO,ES}) - result = OrderedDict{TKey,T}() + result = OrderedDict{TKey,Grouping{TKey,TR}}() for i in iter.source key = iter.elementSelector(i) - if !haskey(result, key) - result[key] = Grouping(key,Array{TR}(0)) - end - push!(result[key].elements,iter.resultSelector(i)) + let key=key + g = get!(result, key) do + return Grouping{TKey, TR}(key,Array{TR,1}(0)) + end + push!(g.elements,iter.resultSelector(i)) + end end - return collect(values(result)),1 + dict_iterator = values(result) + return dict_iterator,start(dict_iterator) end function next{T,TKey,TR,SO,ES}(iter::EnumerableGroupBy{T,TKey,TR,SO,ES}, state) - results = state[1] - curr_index = state[2] - return results[curr_index], (results, curr_index+1) + dict_iterator = state[1] + dict_iterator_state = state[2] + + x = next(dict_iterator, dict_iterator_state) + v = x[1] + dict_iterator_state_new = x[2] + + return v, (dict_iterator, dict_iterator_state_new) end function done{T,TKey,TR,SO,ES}(iter::EnumerableGroupBy{T,TKey,TR,SO,ES}, state) - results = state[1] - curr_index = state[2] - return curr_index > length(results) + dict_iterator = state[1] + dict_iterator_state = state[2] + return done(dict_iterator, dict_iterator_state) end diff --git a/src/enumerable/enumerable_select.jl b/src/enumerable/enumerable_select.jl index b4a815fe..ac376f8d 100644 --- a/src/enumerable/enumerable_select.jl +++ b/src/enumerable/enumerable_select.jl @@ -3,6 +3,8 @@ immutable EnumerableSelect{T, S, Q<:Function} <: Enumerable f::Q end +IterableTables.iteratorsize2{T,S,Q}(::Type{EnumerableSelect{T,S,Q}}) = IterableTables.iteratorsize2(S) + Base.iteratorsize{T,S,Q}(::Type{EnumerableSelect{T,S,Q}}) = Base.iteratorsize(S) Base.eltype{T,S,Q}(iter::EnumerableSelect{T,S,Q}) = T @@ -11,6 +13,8 @@ Base.eltype{T,S,Q}(iter::Type{EnumerableSelect{T,S,Q}}) = T Base.length{T,S,Q}(iter::EnumerableSelect{T,S,Q}) = length(iter.source) +Base.length{T,S,Q}(iter::EnumerableSelect{T,S,Q}, state) = length(iter.source, state) + function select(source::Enumerable, f::Function, f_expr::Expr) TS = eltype(source) T = Base.return_types(f, (TS,))[1] diff --git a/src/sources/source_iterable.jl b/src/sources/source_iterable.jl index 8d7edb83..7e34e5a8 100644 --- a/src/sources/source_iterable.jl +++ b/src/sources/source_iterable.jl @@ -25,9 +25,8 @@ function start{T,S}(iter::EnumerableIterable{T,S}) return start(iter.source) end -function next{T,S}(iter::EnumerableIterable{T,S}, state) - source_value, source_next_state = next(iter.source, state) - return source_value, source_next_state +@inline function next{T,S}(iter::EnumerableIterable{T,S}, state) + return next(iter.source, state) end function done{T,S}(iter::EnumerableIterable{T,S}, state)