diff --git a/benchmark/Rdatatable.jl b/benchmark/Rdatatable.jl index 2939ae2d..d8480dc8 100644 --- a/benchmark/Rdatatable.jl +++ b/benchmark/Rdatatable.jl @@ -4,7 +4,7 @@ using StatsBase, RCall, Query, DataFrames, DataFramesMeta, DataTables using IndexedTables using IndexedTables.Table -function R_datatable(N,K) +function R_datatable(N, K) R""" library(data.table) @@ -43,7 +43,7 @@ function R_datatable(N,K) end -function R_dplyr(N,K) +function R_dplyr(N, K) R""" library(dplyr) @@ -81,217 +81,217 @@ function R_dplyr(N,K) return timings end -function createDataFrame(N::Int,K::Int) +function createDataFrame(N::Int, K::Int) - df = DataFrame(id1 = sample(["id$x" for x in 1:K],N), - id2 = sample(["id$x" for x in 1:K],N), - id3 = sample(["id$x" for x in 1:(N/K)],N), - id4 = sample(1:K,N), - id5 = sample(1:K,N), - id6 = sample(1:(N/K),N), - v1 = sample(1:5,N), - v2 = sample(1:5,N), - v3 = sample(round.(rand(100),4),N)) + df = DataFrame(id1=sample(["id$x" for x in 1:K], N), + id2=sample(["id$x" for x in 1:K], N), + id3=sample(["id$x" for x in 1:(N / K)], N), + id4=sample(1:K, N), + id5=sample(1:K, N), + id6=sample(1:(N / K), N), + v1=sample(1:5, N), + v2=sample(1:5, N), + v3=sample(round.(rand(100), 4), N)) return df end -function createDataTable(N::Int,K::Int) +function createDataTable(N::Int, K::Int) - df = DataTable(id1 = sample(["id$x" for x in 1:K],N), - id2 = sample(["id$x" for x in 1:K],N), - id3 = sample(["id$x" for x in 1:(N/K)],N), - id4 = sample(1:K,N), - id5 = sample(1:K,N), - id6 = sample(1:(N/K),N), - v1 = sample(1:5,N), - v2 = sample(1:5,N), - v3 = sample(round.(rand(100),4),N)) + df = DataTable(id1=sample(["id$x" for x in 1:K], N), + id2=sample(["id$x" for x in 1:K], N), + id3=sample(["id$x" for x in 1:(N / K)], N), + id4=sample(1:K, N), + id5=sample(1:K, N), + id6=sample(1:(N / K), N), + v1=sample(1:5, N), + v2=sample(1:5, N), + v3=sample(round.(rand(100), 4), N)) return df end -function createIndexedTable(N::Int,K::Int) - - df = Table(Columns(id1 = sample(["id$x" for x in 1:K],N), - id2 = sample(["id$x" for x in 1:K],N), - id3 = sample(["id$x" for x in 1:(N/K)],N), - id4 = sample(1:K,N), - id5 = sample(1:K,N), - id6 = sample(1:(N/K),N), - v1 = sample(1:5,N), - v2 = sample(1:5,N), - v3 = sample(round.(rand(100),4),N))) +function createIndexedTable(N::Int, K::Int) + + df = Table(Columns(id1=sample(["id$x" for x in 1:K], N), + id2=sample(["id$x" for x in 1:K], N), + id3=sample(["id$x" for x in 1:(N / K)], N), + id4=sample(1:K, N), + id5=sample(1:K, N), + id6=sample(1:(N / K), N), + v1=sample(1:5, N), + v2=sample(1:5, N), + v3=sample(round.(rand(100), 4), N))) return df end function benches(df::DataFrame) - # timings + #  timings ti = Dict() ti[:sum1] = @elapsed @from i in df begin - @group i by i.id1 into g - @select {r=sum(g.v1)} - @collect DataFrame - end + @group i by i.id1 into g + @select {r = sum(g.v1)} + @collect DataFrame + end ti[:sum2] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g.v1)} + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum3] = @elapsed @from i in df begin - @group i by (i.id1,i.id2) into g - @select {r=sum(g.v1)} + @group i by (i.id1, i.id2) into g + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum4] = @elapsed @from i in df begin - @group i by (i.id1,i.id2) into g - @select {r=sum(g.v1)} + @group i by (i.id1, i.id2) into g + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum_mean1] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g.v1),m=mean(g.v3)} + @select {s = sum(g.v1),m = mean(g.v3)} @collect DataFrame end ti[:sum_mean2] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g.v1),m=mean(g.v3)} + @select {s = sum(g.v1),m = mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end return ti end function benches(df::DataTable) - # timings + #  timings ti = Dict() ti[:sum1] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g.v1)} + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum2] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g.v1)} + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum3] = @elapsed @from i in df begin - @group i by (i.id1,i.id2) into g - @select {r=sum(g.v1)} + @group i by (i.id1, i.id2) into g + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum4] = @elapsed @from i in df begin - @group i by (i.id1,i.id2) into g - @select {r=sum(g.v1)} + @group i by (i.id1, i.id2) into g + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum_mean1] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g.v1),m=mean(g.v3)} + @select {s = sum(g.v1),m = mean(g.v3)} @collect DataFrame end ti[:sum_mean2] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g.v1),m=mean(g.v3)} + @select {s = sum(g.v1),m = mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end return ti end function benches(df::IndexedTable) - # timings + #  timings ti = Dict() ti[:sum1] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g.v1)} + @select {r = sum(g.v1)} @collect DataFrame - end + end ti[:sum2] = @elapsed @from i in df begin @group i by i.id1 into g - @select {r=sum(g.v1)} + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum3] = @elapsed @from i in df begin - @group i by (i.id1,i.id2) into g - @select {r=sum(g.v1)} + @group i by (i.id1, i.id2) into g + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum4] = @elapsed @from i in df begin - @group i by (i.id1,i.id2) into g - @select {r=sum(g.v1)} + @group i by (i.id1, i.id2) into g + @select {r = sum(g.v1)} @collect DataFrame end ti[:sum_mean1] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g.v1),m=mean(g.v3)} + @select {s = sum(g.v1),m = mean(g.v3)} @collect DataFrame end ti[:sum_mean2] = @elapsed @from i in df begin @group i by i.id3 into g - @select {s=sum(g.v1),m=mean(g.v3)} + @select {s = sum(g.v1),m = mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin @group i by i.id4 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin - @group i by i.id6 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @group i by i.id6 into g + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin @group i by i.id6 into g - @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)} + @select {m7 = mean(g.v1),m8 = mean(g.v2),m9 = mean(g.v3)} @collect DataFrame end return ti @@ -299,7 +299,7 @@ end function DfMeta_benches(df::DataFrame) - # timings + #  timings ti = Dict() ti[:sum1] = @elapsed @linq df |> @@ -308,82 +308,82 @@ function DfMeta_benches(df::DataFrame) ti[:sum2] = @elapsed @linq df |> @by(:id1,r = sum(:v1)) - ti[:sum3] = @elapsed @linq df |> + ti[:sum3] = @elapsed @linq df |> @by([:id1,:id2],r = sum(:v1)) ti[:sum4] = @elapsed @linq df |> @by([:id1,:id2],r = sum(:v1)) ti[:sum_mean1] = @elapsed @linq df |> - @by(:id3,s = sum(:v1),m=mean(:v1)) + @by(:id3,s = sum(:v1),m = mean(:v1)) ti[:sum_mean2] = @elapsed @linq df |> - @by(:id3,s = sum(:v1),m=mean(:v1)) + @by(:id3,s = sum(:v1),m = mean(:v1)) ti[:mean7_9_by_id4_1] = @elapsed @linq df |> - @by(:id4,m7=mean(:v1),m8=mean(:v2),m9=mean(:v3)) + @by(:id4,m7 = mean(:v1),m8 = mean(:v2),m9 = mean(:v3)) ti[:mean7_9_by_id4_2] = @elapsed @linq df |> - @by(:id4,m7=mean(:v1),m8=mean(:v2),m9=mean(:v3)) + @by(:id4,m7 = mean(:v1),m8 = mean(:v2),m9 = mean(:v3)) ti[:sum7_9_by_id6_1] = @elapsed @linq df |> - @by(:id6,m7=mean(:v1),m8=mean(:v2),m9=mean(:v3)) - + @by(:id6,m7 = mean(:v1),m8 = mean(:v2),m9 = mean(:v3)) + ti[:sum7_9_by_id6_2] = @elapsed @linq df |> - @by(:id6,m7=mean(:v1),m8=mean(:v2),m9=mean(:v3)) + @by(:id6,m7 = mean(:v1),m8 = mean(:v2),m9 = mean(:v3)) return ti end function run_benches(N=1_000_000;K=100) # get small data for JIT warmup - d_ = createDataFrame(10,3) - dt_ = createDataTable(10,3) - di_ = createIndexedTable(10,3) + d_ = createDataFrame(10, 3) + dt_ = createDataTable(10, 3) + di_ = createIndexedTable(10, 3) - # warm up julia benchmarks + #  warm up julia benchmarks benches(d_); benches(dt_); benches(di_); DfMeta_benches(d_); - # get real data - d = createDataFrame(N,K) + #  get real data + d = createDataFrame(N, K) # measure DataFrames query_df = benches(d) meta = DfMeta_benches(d) d = 0 gc() # measure DataTables - dt = createDataTable(N,K) + dt = createDataTable(N, K) query_dt = benches(dt) dt = 0 gc() # measure IndexedTables - di = createIndexedTable(N,K) + di = createIndexedTable(N, K) query_di = benches(di) di = 0 gc() # get R data.table time - R = R_datatable(N,K) + R = R_datatable(N, K) # get R dplyr time - Rdplyr = R_dplyr(N,K) + Rdplyr = R_dplyr(N, K) # get k = collect(keys(query_df)) - out = DataFrame(bench = k,Query_DF = [query_df[kk] for kk in k],Query_DT= [query_dt[kk] for kk in k],Query_idxT= [query_di[kk] for kk in k],DataFramesMeta=[meta[kk] for kk in k], Rdplyr=[Rdplyr[kk] for kk in k], Rdatatable=[R[kk] for kk in k]) - sort!(out,cols=:bench) + out = DataFrame(bench=k, Query_DF=[query_df[kk] for kk in k], Query_DT=[query_dt[kk] for kk in k], Query_idxT=[query_di[kk] for kk in k], DataFramesMeta=[meta[kk] for kk in k], Rdplyr=[Rdplyr[kk] for kk in k], Rdatatable=[R[kk] for kk in k]) + sort!(out, cols=:bench) rel = deepcopy(out) - rel = @transform(rel,Query_DF = :Query_DF./:Rdatatable, Query_DT = :Query_DT./:Rdatatable, Query_idxT = :Query_idxT./:Rdatatable, DataFramesMeta = :DataFramesMeta ./ :Rdatatable, Rdplyr = :Rdplyr ./ :Rdatatable, Rdatatable = 1.0) - return (out,rel) + rel = @transform(rel,Query_DF = :Query_DF ./ :Rdatatable, Query_DT = :Query_DT ./ :Rdatatable, Query_idxT = :Query_idxT ./ :Rdatatable, DataFramesMeta = :DataFramesMeta ./ :Rdatatable, Rdplyr = :Rdplyr ./ :Rdatatable, Rdatatable = 1.0) + return (out, rel) end function run_all() - d=Dict() + d = Dict() for n in [10_000, 100_000, 1_000_000, 2_000_000] d[n] = run_benches(n) gc() diff --git a/docs/make.jl b/docs/make.jl index a3739b11..7f020fd9 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,10 +1,10 @@ using Documenter, Query, DataFrames makedocs( - modules = [Query], - sitename = "Query.jl", + modules=[Query], + sitename="Query.jl", analytics="UA-132838790-1", - pages = [ + pages=[ "Introduction" => "index.md", "Getting Started" => "gettingstarted.md", "Standalone Query Commands" => "standalonequerycommands.md", @@ -16,5 +16,5 @@ makedocs( ) deploydocs( - repo = "github.com/queryverse/Query.jl.git" + repo="github.com/queryverse/Query.jl.git" ) diff --git a/example/01-DataFrame.jl b/example/01-DataFrame.jl index 1a31a38a..fa3034bb 100644 --- a/example/01-DataFrame.jl +++ b/example/01-DataFrame.jl @@ -4,8 +4,8 @@ using DataFrames df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3,5,2]) x = @from i in df begin - @where i.age>30. && i.children > 2 - @select {Name=lowercase(i.name)} + @where i.age > 30. && i.children > 2 + @select {Name = lowercase(i.name)} @collect DataFrame end diff --git a/example/02-Dict.jl b/example/02-Dict.jl index 28ed7622..3086ecb5 100644 --- a/example/02-Dict.jl +++ b/example/02-Dict.jl @@ -1,19 +1,19 @@ using Query using DataFrames -source = Dict("John"=>34., "Sally"=>56.) +source = Dict("John" => 34., "Sally" => 56.) result = @from i in source begin - @where i.second>36. - @select {Name=lowercase(i.first)} - @collect DataFrame + @where i.second > 36. + @select {Name = lowercase(i.first)} + @collect DataFrame end println(result) result = @from i in source begin - @where i.second>36. - @select {Name=lowercase(i.first)} + @where i.second > 36. + @select {Name = lowercase(i.first)} @collect end diff --git a/example/03-Array.jl b/example/03-Array.jl index c98ec6e2..9c99caeb 100644 --- a/example/03-Array.jl +++ b/example/03-Array.jl @@ -11,16 +11,16 @@ push!(source, Person("John", ["Sally", "Miles", "Frank"])) push!(source, Person("Sally", ["Don", "Martin"])) result = @from i in source begin - @where length(i.Friends) > 2 - @select {i.Name, Friendcount=length(i.Friends)} - @collect + @where length(i.Friends) > 2 + @select {i.Name, Friendcount = length(i.Friends)} + @collect end println(result) result = @from i in source begin @where length(i.Friends) > 2 - @select {i.Name, Friendcount=length(i.Friends)} + @select {i.Name, Friendcount = length(i.Friends)} @collect DataFrame end diff --git a/example/05-NA.jl b/example/05-NA.jl index fbb661e8..02fee746 100644 --- a/example/05-NA.jl +++ b/example/05-NA.jl @@ -4,8 +4,8 @@ using DataFrames df = DataFrame(name=["John", missing, "Kirk"], age=[23., 42., 59.], children=[3,5,2]) x = @from i in df begin - @where i.age>30 && i.children >2 - @select {Name=lowercase(i.name)} + @where i.age > 30 && i.children > 2 + @select {Name = lowercase(i.name)} @collect DataFrame end diff --git a/example/08-join.jl b/example/08-join.jl index 5b5bdd6c..887c2093 100644 --- a/example/08-join.jl +++ b/example/08-join.jl @@ -4,8 +4,8 @@ df1 = DataFrame(a=[1,2,3], b=[1.,2.,3.]) df2 = DataFrame(c=[2.,4.,2.], d=["John", "Jim","Sally"]) x = @from i in df1 begin - @join j in df2 on i.a equals convert(Int,j.c) - @select {i.a,i.b,j.c,j.d,e="Name: $(j.d)"} + @join j in df2 on i.a equals convert(Int, j.c) + @select {i.a,i.b,j.c,j.d,e = "Name: $(j.d)"} @collect DataFrame end diff --git a/example/09-let.jl b/example/09-let.jl index 548c616e..7cafc406 100644 --- a/example/09-let.jl +++ b/example/09-let.jl @@ -7,7 +7,7 @@ x = @from i in df begin @let count = length(i.name) @let kids_per_year = i.children / i.age @where count > 4 - @select {Name=i.name, Count=count, KidsPerYear=kids_per_year} + @select {Name = i.name, Count = count, KidsPerYear = kids_per_year} @collect DataFrame end diff --git a/example/10-orderby.jl b/example/10-orderby.jl index 1594abc5..87347084 100644 --- a/example/10-orderby.jl +++ b/example/10-orderby.jl @@ -5,7 +5,7 @@ df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3, x = @from i in df begin @orderby i.age - @select {Name=lowercase(i.name)} + @select {Name = lowercase(i.name)} @collect DataFrame end @@ -13,7 +13,7 @@ println(x) x = @from i in df begin @orderby descending(i.age) - @select {Name=lowercase(i.name)} + @select {Name = lowercase(i.name)} @collect DataFrame end @@ -21,7 +21,7 @@ println(x) x = @from i in df begin @orderby ascending(i.age) - @select {Name=lowercase(i.name)} + @select {Name = lowercase(i.name)} @collect DataFrame end diff --git a/example/13-selectmany.jl b/example/13-selectmany.jl index 1ef21540..98b0dc6d 100644 --- a/example/13-selectmany.jl +++ b/example/13-selectmany.jl @@ -11,11 +11,11 @@ end println(q) -source_dict = Dict(:a=>[1,2,3], :b=>[4,5]) +source_dict = Dict(:a => [1,2,3], :b => [4,5]) q = @from i in source_dict begin @from j in i.second - @select {Key=i.first,Value=j} + @select {Key = i.first,Value = j} @collect DataFrame end diff --git a/example/15-groupinto.jl b/example/15-groupinto.jl index 75a13554..7582517b 100644 --- a/example/15-groupinto.jl +++ b/example/15-groupinto.jl @@ -5,7 +5,7 @@ df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3, x = @from i in df begin @group i by i.children into g - @select {Key=key(g),Count=length(g)} + @select {Key = key(g),Count = length(g)} @collect DataFrame end diff --git a/example/17-groupjoin.jl b/example/17-groupjoin.jl index 54e057ec..c54b8ca0 100644 --- a/example/17-groupjoin.jl +++ b/example/17-groupjoin.jl @@ -4,9 +4,9 @@ df1 = DataFrame(a=[1,2,3], b=[1.,2.,3.]) df2 = DataFrame(c=[2.,4.,2.], d=["John", "Jim","Sally"]) x = @from i in df1 begin - @join j in df2 on i.a equals convert(Int,j.c) into k - @where i.a>1 - @select {t1=i,t2=k} + @join j in df2 on i.a equals convert(Int, j.c) into k + @where i.a > 1 + @select {t1 = i,t2 = k} @collect DataFrame end diff --git a/example/18-orderby-nested.jl b/example/18-orderby-nested.jl index 927569b3..d122a59d 100644 --- a/example/18-orderby-nested.jl +++ b/example/18-orderby-nested.jl @@ -1,7 +1,7 @@ using Query using DataFrames -df = DataFrame(a=[2,1,1,2,1,3],b=[2,2,1,1,3,2]) +df = DataFrame(a=[2,1,1,2,1,3], b=[2,2,1,1,3,2]) x = @from i in df begin @orderby descending(i.a), i.b diff --git a/example/23-dict-sink.jl b/example/23-dict-sink.jl index 18304d7c..d0e12533 100644 --- a/example/23-dict-sink.jl +++ b/example/23-dict-sink.jl @@ -4,7 +4,7 @@ using DataFrames df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3,5,2]) q = @from i in df begin - @select i.name=>i.children + @select i.name => i.children @collect Dict end diff --git a/example/25-ab-syntax.jl b/example/25-ab-syntax.jl index 7550b083..143642ce 100644 --- a/example/25-ab-syntax.jl +++ b/example/25-ab-syntax.jl @@ -2,13 +2,13 @@ using Query using DataFrames using Statistics -df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), - age=vcat([10., 20., 30.],[10., 20., 30.].+3), - children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b]) +df = DataFrame(name=repeat(["John", "Sally", "Kirk"], inner=[1], outer=[2]), + age=vcat([10., 20., 30.], [10., 20., 30.] .+ 3), + children=repeat([3,2,2], inner=[1], outer=[2]),state=[:a,:a,:a,:b,:b,:b]) x = @from i in df begin @group i by i.state into g - @select {group=key(g),mage=mean(g.age), oldest=maximum(g.age), youngest=minimum(g.age)} + @select {group = key(g),mage = mean(g.age), oldest = maximum(g.age), youngest = minimum(g.age)} @collect DataFrame end diff --git a/example/prep_data.jl b/example/prep_data.jl index 7821f5dd..0c0576c8 100644 --- a/example/prep_data.jl +++ b/example/prep_data.jl @@ -1,14 +1,14 @@ using DataFrames -n=10_000_000 +n = 10_000_000 # Right now things only work columns of type Array, so # we need this slighlty cumbersome DataFrame construction # to prevent DataArray or NullableArray creation # We are also skipping all Strings because of #14955 (I think) -data_friends = fill(4,n) -data_age = fill(38.2,n) -data_children = fill(2,n) +data_friends = fill(4, n) +data_age = fill(38.2, n) +data_children = fill(2, n) columns = [] push!(columns, data_friends) diff --git a/src/Query.jl b/src/Query.jl index ff77807e..c1d3cac3 100644 --- a/src/Query.jl +++ b/src/Query.jl @@ -2,7 +2,7 @@ module Query import IterableTables using DataValues -using MacroTools: postwalk +using MacroTools:postwalk using QueryOperators export @from, @query, @count, Grouping, key @@ -23,17 +23,17 @@ include("standalone_query_macros.jl") include("table_query_macros.jl") macro from(range::Expr, body::Expr) - if range.head!=:call || (range.args[1]!=:in && range.args[1]!=in) + if range.head != :call || (range.args[1] != :in && range.args[1] != in) error() end - if body.head!=:block + if body.head != :block error() end - body.args = filter(i->!isa(i, LineNumberNode),body.args) + body.args = filter(i -> !isa(i, LineNumberNode), body.args) - insert!(body.args,1,:( @from $(range.args[2]) in $(range.args[3]) )) + insert!(body.args, 1, :( @from $(range.args[2]) in $(range.args[3]) )) translate_query(body) @@ -41,7 +41,7 @@ macro from(range::Expr, body::Expr) end macro query(range::Symbol, body::Expr) - if body.head!=:block + if body.head != :block error() end diff --git a/src/query_utils.jl b/src/query_utils.jl index 501cd18b..16a6f43c 100644 --- a/src/query_utils.jl +++ b/src/query_utils.jl @@ -1,8 +1,8 @@ ismacro(ex, name::Symbol, nargs::Integer=-1) = - isa(ex, Expr) && ex.head==:macrocall && ex.args[1]==name && - (nargs == -1 || length(ex.args) == nargs+1) + isa(ex, Expr) && ex.head == :macrocall && ex.args[1] == name && + (nargs == -1 || length(ex.args) == nargs + 1) ismacro(ex, name::String, nargs::Integer=-1) = ismacro(ex, Symbol(name), nargs) iscall(ex, name::Symbol, nargs::Integer=-1) = - isa(ex, Expr) && ex.head==:call && ex.args[1]==name && - (nargs == -1 || length(ex.args) == nargs+1) + isa(ex, Expr) && ex.head == :call && ex.args[1] == name && + (nargs == -1 || length(ex.args) == nargs + 1) diff --git a/src/standalone_query_macros.jl b/src/standalone_query_macros.jl index 0655e8fe..12434f64 100644 --- a/src/standalone_query_macros.jl +++ b/src/standalone_query_macros.jl @@ -35,7 +35,7 @@ end macro groupby(elementSelector) elementSelector_as_anonym_func = helper_replace_anon_func_syntax(elementSelector) - resultSelector_as_anonym_func = :(i->i) + resultSelector_as_anonym_func = :(i -> i) q_elementSelector = Expr(:quote, elementSelector_as_anonym_func) q_resultSelector = Expr(:quote, resultSelector_as_anonym_func) @@ -178,11 +178,11 @@ end macro map(f) f_as_anonym_func = helper_replace_anon_func_syntax(f) q = Expr(:quote, f_as_anonym_func) - return :( i-> QueryOperators.map(QueryOperators.query(i), $(esc(f_as_anonym_func)), $(esc(q))) ) |> + return :( i -> QueryOperators.map(QueryOperators.query(i), $(esc(f_as_anonym_func)), $(esc(q))) ) |> helper_namedtuples_replacement end -macro mapmany(source, collectionSelector,resultSelector) +macro mapmany(source, collectionSelector, resultSelector) collectionSelector_as_anonym_func = helper_replace_anon_func_syntax(collectionSelector) resultSelector_as_anonym_func = helper_replace_anon_func_syntax(resultSelector) @@ -195,14 +195,14 @@ macro mapmany(source, collectionSelector,resultSelector) helper_namedtuples_replacement end -macro mapmany(collectionSelector,resultSelector) +macro mapmany(collectionSelector, resultSelector) collectionSelector_as_anonym_func = helper_replace_anon_func_syntax(collectionSelector) resultSelector_as_anonym_func = helper_replace_anon_func_syntax(resultSelector) collectionSelector_q = Expr(:quote, collectionSelector_as_anonym_func) resultSelector_q = Expr(:quote, resultSelector_as_anonym_func) - return :( i-> QueryOperators.mapmany(QueryOperators.query(i), + return :( i -> QueryOperators.mapmany(QueryOperators.query(i), $(esc(collectionSelector_as_anonym_func)), $(esc(collectionSelector_q)), $(esc(resultSelector_as_anonym_func)), $(esc(resultSelector_q)))) |> helper_namedtuples_replacement @@ -239,7 +239,7 @@ macro drop(n) end macro unique() - return :( i -> QueryOperators.unique(QueryOperators.query(i), q->q, :(q->q))) |> + return :( i -> QueryOperators.unique(QueryOperators.query(i), q -> q, :(q -> q))) |> helper_namedtuples_replacement end diff --git a/src/table_query_macros.jl b/src/table_query_macros.jl index d46a8ab7..1c7ab70d 100644 --- a/src/table_query_macros.jl +++ b/src/table_query_macros.jl @@ -41,7 +41,7 @@ macro select(args...) elseif typeof(arg) == QuoteNode # select by name prev = :( merge($prev, QueryOperators.NamedTupleUtilities.select(_, Val($(arg)))) ) - elseif arg isa Expr && arg.head==:call && length(arg.args)==3 && arg.args[1]==Symbol(":") + elseif arg isa Expr && arg.head == :call && length(arg.args) == 3 && arg.args[1] == Symbol(":") arg = string(arg) # select by range, with multiple syntaxes supported m_range = match(r"^:([^,:]+) *: *:([^,:]+)", arg) @@ -184,7 +184,7 @@ macro mutate(args...) prev = :( Base.merge($prev, ($(arg.args[1]) = $(arg.args[2]),)) ) end - return :( Query.@map( $prev ) ) |> esc +return :( Query.@map( $prev ) ) |> esc end our_get(x) = x @@ -198,7 +198,7 @@ macro disallowna() end macro disallowna(columns...) - return :( Query.@mutate( $( ( :( $(columns[i].value) = our_get(_.$(columns[i].value)) ) for i=1:length(columns) )... ) ) ) + return :( Query.@mutate( $( ( :( $(columns[i].value) = our_get(_.$(columns[i].value)) ) for i = 1:length(columns) )... ) ) ) end # The following is a backwards compat fix @@ -206,28 +206,28 @@ macro dissallowna() return :( Query.@map(map(our_get, _)) ) end macro dissallowna(columns...) - return :( Query.@mutate( $( ( :( $(columns[i].value) = our_get(_.$(columns[i].value)) ) for i=1:length(columns) )... ) ) ) + return :( Query.@mutate( $( ( :( $(columns[i].value) = our_get(_.$(columns[i].value)) ) for i = 1:length(columns) )... ) ) ) end macro dropna() - return :( i-> i |> Query.@filter(!any(isna, _)) |> Query.@disallowna() ) + return :( i -> i |> Query.@filter(!any(isna, _)) |> Query.@disallowna() ) end macro dropna(columns...) - return :( i-> i |> Query.@filter(!any(($((:(isna(_.$(columns[i].value))) for i in 1:length(columns) )...),))) |> Query.@disallowna($(columns...)) ) + return :( i -> i |> Query.@filter(!any(($((:(isna(_.$(columns[i].value))) for i in 1:length(columns) )...),))) |> Query.@disallowna($(columns...)) ) end macro replacena(arg, args...) - if length(args)==0 && !(arg isa Expr && arg.head==:call && length(arg.args)==3 && arg.args[1]==:(=>)) - return :( Query.@map(map(i->our_get(i, $arg), _)) ) + if length(args) == 0 && !(arg isa Expr && arg.head == :call && length(arg.args) == 3 && arg.args[1] == :(=>)) + return :( Query.@map(map(i -> our_get(i, $arg), _)) ) else args = [arg; args...] - all(i isa Expr && i.head==:call && length(i.args)==3 && i.args[1]==:(=>) for i in args) || error("Invalid syntax.") + all(i isa Expr && i.head == :call && length(i.args) == 3 && i.args[1] == :(=>) for i in args) || error("Invalid syntax.") - columns = map(i->i.args[2].value, args) - replacement_values = map(i->i.args[3], args) + columns = map(i -> i.args[2].value, args) + replacement_values = map(i -> i.args[3], args) - return :( Query.@mutate( $( ( :( $(columns[i]) = our_get(_.$(columns[i]), $(replacement_values[i])) ) for i=1:length(columns) )... ) ) ) + return :( Query.@mutate( $( ( :( $(columns[i]) = our_get(_.$(columns[i]), $(replacement_values[i])) ) for i = 1:length(columns) )... ) ) ) end end diff --git a/test/runtests.jl b/test/runtests.jl index 0359ba30..30141fce 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,7 +18,7 @@ end @test Query.ismacro(:(@from 1), "@from") @test Query.ismacro(:(@from(1)), "@from") @test !Query.ismacro(:(@from 1), "@for") - @test !Query.ismacro(:(@from,1), "@from") + @test !Query.ismacro(:(@from, 1), "@from") @test Query.ismacro(:(@from 1 2 3), "@from", 4) @test !Query.ismacro(:(@from 1 2 3 4), "@from", 4) @test !Query.ismacro(:(map(1)), :map) @@ -28,9 +28,9 @@ end @test !Query.iscall(:(@from 1), Symbol("@from")) @test !Query.iscall(:(@from(1)), Symbol("@from")) @test Query.iscall(:(map(1)), :map) - @test !Query.iscall(:(map,1), :map) - @test Query.iscall(:(map(1,2,3)), :map, 3) - @test !Query.iscall(:(map(1,2,3,4)), :map, 3) + @test !Query.iscall(:(map, 1), :map) + @test Query.iscall(:(map(1, 2, 3)), :map, 3) + @test !Query.iscall(:(map(1, 2, 3, 4)), :map, 3) end @testset "shift_access!" begin @@ -44,121 +44,121 @@ end source_df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3,5,2]) q = @from i in source_df begin - @where i.age>30. && i.children > 2 - @select {Name=lowercase(i.name)} + @where i.age > 30. && i.children > 2 + @select {Name = lowercase(i.name)} @collect DataFrame end @test isa(q, DataFrame) -@test size(q)==(1,1) -@test q[1,:Name]=="sally" +@test size(q) == (1, 1) +@test q[1,:Name] == "sally" -source_dict = Dict("John"=>34., "Sally"=>56.) +source_dict = Dict("John" => 34., "Sally" => 56.) q = @from i in source_dict begin - @where i.second>36. - @select {Name=lowercase(i.first)} + @where i.second > 36. + @select {Name = lowercase(i.first)} @collect DataFrame end @test isa(q, DataFrame) -@test size(q)==(1,1) -@test q[1,:Name]=="sally" +@test size(q) == (1, 1) +@test q[1,:Name] == "sally" q = @from i in source_dict begin - @where i.second>36. + @where i.second > 36. @select lowercase(i.first) @collect end @test isa(q, Vector{String}) -@test length(q)==1 -@test q[1]=="sally" +@test length(q) == 1 +@test q[1] == "sally" -source_array = Array{Person}(undef,0) +source_array = Array{Person}(undef, 0) push!(source_array, Person("John", ["Sally", "Miles", "Frank"])) push!(source_array, Person("Sally", ["Don", "Martin"])) q = @from i in source_array begin @where length(i.Friends) > 2 - @select {i.Name, Friendcount=length(i.Friends)} + @select {i.Name, Friendcount = length(i.Friends)} @collect end -@test isa(q,Vector{NamedTuple{(:Name,:Friendcount),Tuple{String,Int}}}) -@test length(q)==1 -@test q[1].Name=="John" -@test q[1].Friendcount==3 +@test isa(q, Vector{NamedTuple{(:Name, :Friendcount),Tuple{String,Int}}}) +@test length(q) == 1 +@test q[1].Name == "John" +@test q[1].Friendcount == 3 q = @from i in source_array begin @where length(i.Friends) > 2 - @select {i.Name, Friendcount=length(i.Friends)} + @select {i.Name, Friendcount = length(i.Friends)} @collect DataFrame end @test isa(q, DataFrame) -@test size(q)==(1,2) -@test q[1,:Name]=="John" -@test q[1,:Friendcount]==3 - +@test size(q) == (1, 2) +@test q[1,:Name] == "John" +@test q[1,:Friendcount] == 3 + source_df2 = DataFrame(a=[1,2,3], b=[1.,2.,3.]) -source_it = DataFrame(c=[2.,4.,2.],d=["John","Jim","Sally"]) +source_it = DataFrame(c=[2.,4.,2.], d=["John","Jim","Sally"]) q = @from i in source_df2 begin - @join j in source_it on i.a equals convert(Int,j.c) - @select {i.a,i.b,j.c,j.d,e="Name: $(j.d)"} + @join j in source_it on i.a equals convert(Int, j.c) + @select {i.a,i.b,j.c,j.d,e = "Name: $(j.d)"} @collect DataFrame end -@test isa(q,DataFrame) -@test size(q)==(2,5) -@test q[1,:a]==2 -@test q[1,:b]==2. -@test q[1,:c]==2. -@test q[1,:d]=="John" -@test q[1,:e]=="Name: John" -@test q[2,:a]==2 -@test q[2,:b]==2. -@test q[2,:c]==2. -@test q[2,:d]=="Sally" -@test q[2,:e]=="Name: Sally" +@test isa(q, DataFrame) +@test size(q) == (2, 5) +@test q[1,:a] == 2 +@test q[1,:b] == 2. +@test q[1,:c] == 2. +@test q[1,:d] == "John" +@test q[1,:e] == "Name: John" +@test q[2,:a] == 2 +@test q[2,:b] == 2. +@test q[2,:c] == 2. +@test q[2,:d] == "Sally" +@test q[2,:e] == "Name: Sally" q = @from i in source_df2 begin @join j in (@from i in source_it begin - @where i.c<3. + @where i.c < 3. @select i - end) on i.a equals convert(Int,j.c) - @select {i.a,i.b,j.c,j.d,e="Name: $(j.d)"} + end) on i.a equals convert(Int, j.c) + @select {i.a,i.b,j.c,j.d,e = "Name: $(j.d)"} @collect DataFrame end -@test isa(q,DataFrame) -@test size(q)==(2,5) -@test q[1,:a]==2 -@test q[1,:b]==2. -@test q[1,:c]==2. -@test q[1,:d]=="John" -@test q[1,:e]=="Name: John" -@test q[2,:a]==2 -@test q[2,:b]==2. -@test q[2,:c]==2. -@test q[2,:d]=="Sally" -@test q[2,:e]=="Name: Sally" +@test isa(q, DataFrame) +@test size(q) == (2, 5) +@test q[1,:a] == 2 +@test q[1,:b] == 2. +@test q[1,:c] == 2. +@test q[1,:d] == "John" +@test q[1,:e] == "Name: John" +@test q[2,:a] == 2 +@test q[2,:b] == 2. +@test q[2,:c] == 2. +@test q[2,:d] == "Sally" +@test q[2,:e] == "Name: Sally" q = @from i in source_df begin @let count = length(i.name) @let kids_per_year = i.children / i.age @where count > 4 - @select {Name=i.name, Count=count, KidsPerYear=kids_per_year} + @select {Name = i.name, Count = count, KidsPerYear = kids_per_year} @collect DataFrame end @test isa(q, DataFrame) -@test size(q)==(1,3) -@test q[1,:Name]=="Sally" -@test q[1,:Count]==5 -@test q[1,:KidsPerYear]≈0.119047619047 +@test size(q) == (1, 3) +@test q[1,:Name] == "Sally" +@test q[1,:Count] == 5 +@test q[1,:KidsPerYear] ≈ 0.119047619047 q = @from i in source_df begin @orderby i.age @@ -167,30 +167,30 @@ q = @from i in source_df begin end @test isa(q, Vector{String}) -@test length(q)==3 -@test q==["john", "sally", "kirk"] +@test length(q) == 3 +@test q == ["john", "sally", "kirk"] q = @from i in source_df begin @orderby descending(i.age) @select lowercase(i.name) @collect end - + @test isa(q, Vector{String}) -@test length(q)==3 -@test q==["kirk", "sally", "john"] +@test length(q) == 3 +@test q == ["kirk", "sally", "john"] q = @from i in source_df begin @orderby ascending(i.age) - @select lowercase(i.name) + @select lowercase(i.name) @collect end @test isa(q, Vector{String}) -@test length(q)==3 -@test q==["john", "sally", "kirk"] +@test length(q) == 3 +@test q == ["john", "sally", "kirk"] -source_nestedsort = [(4,3),(4,3),(1,2),(1,1)] +source_nestedsort = [(4, 3),(4, 3),(1, 2),(1, 1)] q = @from i in source_nestedsort begin @orderby i[1], descending(i[2]) @select i @@ -198,8 +198,8 @@ q = @from i in source_nestedsort begin end @test isa(q, Vector{Tuple{Int,Int}}) -@test length(q)==4 -@test q==[(1,2),(1,1),(4,3),(4,3)] +@test length(q) == 4 +@test q == [(1, 2),(1, 1),(4, 3),(4, 3)] # We need to use a typed const here, otherwise type inference stands no chance @@ -207,17 +207,17 @@ closure_var_1::Int = 1 q = @from i in source_df begin @let k = i.children + closure_var_1 - @join j in source_df2 on i.children*closure_var_1 equals j.a*closure_var_1 - @where i.age>closure_var_1 - @orderby i.age*closure_var_1 + @join j in source_df2 on i.children * closure_var_1 equals j.a * closure_var_1 + @where i.age > closure_var_1 + @orderby i.age * closure_var_1 @select i.children + closure_var_1 @collect end @test isa(q, Vector{Int}) -@test length(q)==2 -@test q[1]==4 -@test q[2]==3 +@test length(q) == 2 +@test q[1] == 4 +@test q[2] == 3 q = @from i in [5,4,4,6,1] begin @orderby i @@ -225,9 +225,9 @@ q = @from i in [5,4,4,6,1] begin @collect end -@test isa(q,Vector{Int}) -@test length(q)==5 -@test q==[1,4,4,5,6] +@test isa(q, Vector{Int}) +@test length(q) == 5 +@test q == [1,4,4,5,6] q = @from i in [5,4,4,6,1] begin @orderby descending(i) @@ -235,9 +235,9 @@ q = @from i in [5,4,4,6,1] begin @collect end -@test isa(q,Vector{Int}) -@test length(q)==5 -@test q==[6,5,4,4,1] +@test isa(q, Vector{Int}) +@test length(q) == 5 +@test q == [6,5,4,4,1] # Test phase 3 query translation @@ -245,73 +245,73 @@ q = @from i in source_array begin @select i @collect end - -@test isa(q,Vector{Person}) -@test length(q)==2 -@test q[1].Name=="John" -@test q[1].Friends==["Sally", "Miles", "Frank"] -@test q[2].Name=="Sally" -@test q[2].Friends==["Don", "Martin"] + +@test isa(q, Vector{Person}) +@test length(q) == 2 +@test q[1].Name == "John" +@test q[1].Friends == ["Sally", "Miles", "Frank"] +@test q[2].Name == "Sally" +@test q[2].Friends == ["Don", "Martin"] q = @from i in source_df begin @from j in source_df2 - @select {Name=i.name,Age=i.age,Children=i.children,A=j.a,B=j.b} + @select {Name = i.name,Age = i.age,Children = i.children,A = j.a,B = j.b} @collect DataFrame end @test isa(q, DataFrame) -@test size(q)==(9,5) -@test q.Name==["John","John","John","Sally","Sally","Sally","Kirk","Kirk","Kirk"] -@test q.Age==[23.,23.,23.,42.,42.,42.,59.,59.,59.] -@test q.Children==[3,3,3,5,5,5,2,2,2] -@test q.A==[1,2,3,1,2,3,1,2,3] -@test q.B==[1.,2.,3.,1.,2.,3.,1.,2.,3.] +@test size(q) == (9, 5) +@test q.Name == ["John","John","John","Sally","Sally","Sally","Kirk","Kirk","Kirk"] +@test q.Age == [23.,23.,23.,42.,42.,42.,59.,59.,59.] +@test q.Children == [3,3,3,5,5,5,2,2,2] +@test q.A == [1,2,3,1,2,3,1,2,3] +@test q.B == [1.,2.,3.,1.,2.,3.,1.,2.,3.] -source_nested_dict = Dict(:a=>[1,2,3], :b=>[4,5]) +source_nested_dict = Dict(:a => [1,2,3], :b => [4,5]) q = @from i in source_nested_dict begin @from j in i.second - @select {Key=i.first,Value=j} + @select {Key = i.first,Value = j} @collect end -@test isa(q, Vector{NamedTuple{(:Key,:Value),Tuple{Symbol,Int}}}) -@test length(q)==5 -@test in((Key=:a,Value=1), q) -@test in((Key=:a,Value=2), q) -@test in((Key=:a,Value=3), q) -@test in((Key=:b,Value=4), q) -@test in((Key=:b,Value=5), q) +@test isa(q, Vector{NamedTuple{(:Key, :Value),Tuple{Symbol,Int}}}) +@test length(q) == 5 +@test in((Key = :a, Value = 1), q) +@test in((Key = :a, Value = 2), q) +@test in((Key = :a, Value = 3), q) +@test in((Key = :b, Value = 4), q) +@test in((Key = :b, Value = 5), q) q = @from i in source_df begin @from j in source_df2 - @where j.a>1 - @select {Name=i.name,Age=i.age,Children=i.children,A=j.a,B=j.b} + @where j.a > 1 + @select {Name = i.name,Age = i.age,Children = i.children,A = j.a,B = j.b} @collect DataFrame end @test isa(q, DataFrame) -@test size(q)==(6,5) -@test q.Name==["John","John","Sally","Sally","Kirk","Kirk"] -@test q.Age==[23.,23.,42.,42.,59.,59.] -@test q.Children==[3,3,5,5,2,2] -@test q.A==[2,3,2,3,2,3] -@test q.B==[2.,3.,2.,3.,2.,3.] +@test size(q) == (6, 5) +@test q.Name == ["John","John","Sally","Sally","Kirk","Kirk"] +@test q.Age == [23.,23.,42.,42.,59.,59.] +@test q.Children == [3,3,5,5,2,2] +@test q.A == [2,3,2,3,2,3] +@test q.B == [2.,3.,2.,3.,2.,3.] -source_nested_dict = Dict(:a=>[1,2,3], :b=>[4,5]) +source_nested_dict = Dict(:a => [1,2,3], :b => [4,5]) q = @from i in source_nested_dict begin @from j in i.second - @where j>2 - @select {Key=i.first,Value=j} + @where j > 2 + @select {Key = i.first,Value = j} @collect end -@test isa(q, Vector{NamedTuple{(:Key,:Value),Tuple{Symbol,Int}}}) -@test length(q)==3 -@test in((Key=:a,Value=3), q) -@test in((Key=:b,Value=4), q) -@test in((Key=:b,Value=5), q) +@test isa(q, Vector{NamedTuple{(:Key, :Value),Tuple{Symbol,Int}}}) +@test length(q) == 3 +@test in((Key = :a, Value = 3), q) +@test in((Key = :b, Value = 4), q) +@test in((Key = :b, Value = 5), q) source_df_groupby = DataFrame(name=["John", "Sally", "Kirk"], children=[3,2,2]) @@ -321,90 +321,90 @@ x = @from i in source_df_groupby begin end @test isa(x, Array{Grouping{Int,String}}) -@test length(x)==2 -@test key(x[1])==3 -@test x[1][:]==["John"] -@test key(x[2])==2 -@test x[2][:]==["Sally", "Kirk"] +@test length(x) == 2 +@test key(x[1]) == 3 +@test x[1][:] == ["John"] +@test key(x[2]) == 2 +@test x[2][:] == ["Sally", "Kirk"] x = @from i in source_df_groupby begin @group i by i.children @collect end -@test isa(x, Vector{Grouping{Int,NamedTuple{(:name,:children),Tuple{String,Int}}}}) -@test length(x)==2 -@test key(x[1])==3 -@test x[1][1].name=="John"; -@test key(x[2])==2 -@test x[2][1].name=="Sally"; -@test x[2][2].name=="Kirk"; +@test isa(x, Vector{Grouping{Int,NamedTuple{(:name, :children),Tuple{String,Int}}}}) +@test length(x) == 2 +@test key(x[1]) == 3 +@test x[1][1].name == "John"; +@test key(x[2]) == 2 +@test x[2][1].name == "Sally"; +@test x[2][2].name == "Kirk"; q = @from i in source_df_groupby begin @group i by i.children into g - @select {Children=key(g),Number_of_parents=length(g)} + @select {Children = key(g),Number_of_parents = length(g)} @collect DataFrame end @test isa(q, DataFrame) -@test size(q)==(2,2) -@test q.Children==[3,2] -@test q.Number_of_parents==[1,2] +@test size(q) == (2, 2) +@test q.Children == [3,2] +@test q.Number_of_parents == [1,2] q = @from i in source_df begin - @where i.age>30. && i.children > 2 + @where i.age > 30. && i.children > 2 @select i into j - @select {Name=lowercase(j.name)} + @select {Name = lowercase(j.name)} @collect DataFrame end @test isa(q, DataFrame) -@test size(q)==(1,1) -@test q[1,:Name]=="sally" +@test size(q) == (1, 1) +@test q[1,:Name] == "sally" q = @from i in source_df2 begin - @join j in source_it on i.a equals convert(Int,j.c) into k - @select {i.a,i.b,c=k} + @join j in source_it on i.a equals convert(Int, j.c) into k + @select {i.a,i.b,c = k} @collect end -@test isa(q,Vector{NamedTuple{(:a,:b,:c),Tuple{Int,Float64,Vector{NamedTuple{(:c,:d),Tuple{Float64,String}}}}}}) -@test length(q)==3 +@test isa(q, Vector{NamedTuple{(:a, :b, :c),Tuple{Int,Float64,Vector{NamedTuple{(:c, :d),Tuple{Float64,String}}}}}}) +@test length(q) == 3 @test q[1].a == 1 -@test q[1].b==1. -@test isa(q[1].c, Vector{NamedTuple{(:c,:d),Tuple{Float64,String}}}) -@test length(q[1].c)==0 -@test q[2].a==2 -@test q[2].b==2. -@test isa(q[2].c, Vector{NamedTuple{(:c,:d),Tuple{Float64,String}}}) -@test length(q[2].c)==2 -@test q[2].c[1].c==2. -@test q[2].c[1].d== "John" -@test q[2].c[2].c==2. -@test q[2].c[2].d== "Sally" -@test q[3].a==3 -@test q[3].b==3. -@test isa(q[3].c, Vector{NamedTuple{(:c,:d),Tuple{Float64,String}}}) -@test length(q[3].c)==0 +@test q[1].b == 1. +@test isa(q[1].c, Vector{NamedTuple{(:c, :d),Tuple{Float64,String}}}) +@test length(q[1].c) == 0 +@test q[2].a == 2 +@test q[2].b == 2. +@test isa(q[2].c, Vector{NamedTuple{(:c, :d),Tuple{Float64,String}}}) +@test length(q[2].c) == 2 +@test q[2].c[1].c == 2. +@test q[2].c[1].d == "John" +@test q[2].c[2].c == 2. +@test q[2].c[2].d == "Sally" +@test q[3].a == 3 +@test q[3].b == 3. +@test isa(q[3].c, Vector{NamedTuple{(:c, :d),Tuple{Float64,String}}}) + @test length(q[3].c) == 0 q = @from i in source_df2 begin - @join j in source_it on i.a equals convert(Int,j.c) into k - @where length(k)>0 - @select {i.a,i.b,c=k} + @join j in source_it on i.a equals convert(Int, j.c) into k + @where length(k) > 0 + @select {i.a,i.b,c = k} @collect end -@test isa(q,Vector{NamedTuple{(:a,:b,:c),Tuple{Int,Float64,Vector{NamedTuple{(:c,:d),Tuple{Float64,String}}}}}}) -@test length(q)==1 -@test q[1].a==2 -@test q[1].b==2. -@test isa(q[1].c, Vector{NamedTuple{(:c,:d),Tuple{Float64,String}}}) -@test length(q[1].c)==2 -@test q[1].c[1].c==2. -@test q[1].c[1].d== "John" -@test q[1].c[2].c==2. -@test q[1].c[2].d== "Sally" +@test isa(q, Vector{NamedTuple{(:a, :b, :c),Tuple{Int,Float64,Vector{NamedTuple{(:c, :d),Tuple{Float64,String}}}}}}) +@test length(q) == 1 +@test q[1].a == 2 +@test q[1].b == 2. +@test isa(q[1].c, Vector{NamedTuple{(:c, :d),Tuple{Float64,String}}}) +@test length(q[1].c) == 2 +@test q[1].c[1].c == 2. + @test q[1].c[1].d == "John" +@test q[1].c[2].c == 2. +@test q[1].c[2].d == "Sally" source_df_nulls = DataFrame(name=["John", "Sally", missing, "Kirk"], age=[23., 42., 54., 59.], children=[3,missing,8,2]) q = @from i in source_df_nulls begin @@ -413,24 +413,24 @@ q = @from i in source_df_nulls begin end @test isa(q, DataFrame) -@test size(q)==(4,3) -@test q[1,:name]=="John" -@test q[2,:name]=="Sally" -@test q[3,:name]===missing -@test q[4,:name]=="Kirk" -@test q.age==[23., 42., 54., 59.] -@test q[1,:children]==3 -@test q[2,:children]===missing -@test q[3,:children]==8 -@test q[4,:children]==2 +@test size(q) == (4, 3) +@test q[1,:name] == "John" +@test q[2,:name] == "Sally" +@test q[3,:name] === missing +@test q[4,:name] == "Kirk" +@test q.age == [23., 42., 54., 59.] +@test q[1,:children] == 3 +@test q[2,:children] === missing +@test q[3,:children] == 8 +@test q[4,:children] == 2 q = collect(QueryOperators.default_if_empty(DataValue{String}[])) -@test length(q)==1 -@test isna(q[1]) +@test length(q) == 1 + @test isna(q[1]) q = collect(QueryOperators.default_if_empty(DataValue{String}["John", "Sally"])) -@test length(q)==2 -@test q==DataValue{String}["John", "Sally"] +@test length(q) == 2 +@test q == DataValue{String}["John", "Sally"] source_df3 = DataFrame(c=Union{Int,Missing}[2,4,2], d=Union{String,Missing}["John", "Jim","Sally"]) @@ -441,34 +441,34 @@ q = @from i in source_df2 begin end @test isa(q, DataFrame) -@test size(q)==(4,4) -@test q.a==[1,2,2,3] -@test q.b==[1.,2.,2.,3.] -@test q[1,:c]===missing -@test q[2,:c]==2 -@test q[3,:c]==2 -@test q[4,:c]===missing -@test q[1,:d]===missing -@test q[2,:d]=="John" -@test q[3,:d]=="Sally" -@test q[4,:d]===missing +@test size(q) == (4, 4) +@test q.a == [1,2,2,3] +@test q.b == [1.,2.,2.,3.] +@test q[1,:c] === missing +@test q[2,:c] == 2 +@test q[3,:c] == 2 +@test q[4,:c] === missing + @test q[1,:d] === missing +@test q[2,:d] == "John" +@test q[3,:d] == "Sally" +@test q[4,:d] === missing q = @from i in source_df begin - @select i.name=>i.children + @select i.name => i.children @collect Dict end @test isa(q, Dict{String,Int}) -@test length(q)==3 -@test q["John"]==3 -@test q["Sally"]==5 -@test q["Kirk"]==2 +@test length(q) == 3 + @test q["John"] == 3 + @test q["Sally"] == 5 +@test q["Kirk"] == 2 q = @from i in source_df begin @let j = i.name @let k = i.children @let l = i.age - @select {a=j, b=k, c=l} + @select {a = j, b = k, c = l} @collect DataFrame end @@ -476,21 +476,21 @@ end @test q.b == [3, 5, 2] @test q.c == [23., 42., 59.] -@test @count(source_df)==3 -@test @count(source_df, i->i.children>3)==1 +@test @count(source_df) == 3 +@test @count(source_df, i -> i.children > 3) == 1 -q = DataFrame(@filter(source_df, i->i.age>30. && i.children > 2)) +q = DataFrame(@filter(source_df, i -> i.age > 30. && i.children > 2)) -@test isa(q, DataFrame) -@test size(q)==(1,3) -@test q[1,:name]=="Sally" -@test q[1,:age]==42. -@test q[1,:children]==5 + @test isa(q, DataFrame) + @test size(q) == (1, 3) + @test q[1,:name] == "Sally" +@test q[1,:age] == 42. +@test q[1,:children] == 5 -q = collect(@map(source_df, i->i.children)) +q = collect(@map(source_df, i -> i.children)) @test isa(q, Vector{Int}) -@test q==[3,5,2] +@test q == [3,5,2] include("test_dplyr-syntax.jl") include("test_pipesyntax.jl") @@ -500,6 +500,6 @@ include("test_macros.jl") # Int32 otherwise. Also only run on Julia 1.6 and newer, because # a lot of output printing was changed and doctests now can't be written # to work on multiple Julia versions. -Int==Int64 && VERSION>=v"1.6" && doctest(Query) +Int == Int64 && VERSION >= v"1.6" && doctest(Query) end diff --git a/test/test_dplyr-syntax.jl b/test/test_dplyr-syntax.jl index 3f1a21a8..80a20280 100644 --- a/test/test_dplyr-syntax.jl +++ b/test/test_dplyr-syntax.jl @@ -7,18 +7,18 @@ using Test @testset "a.b Syntax (dplyr API)" begin - df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), - age=vcat([10., 20., 30.],[10., 20., 30.].+3), - children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b]) + df = DataFrame(name=repeat(["John", "Sally", "Kirk"], inner=[1], outer=[2]), + age=vcat([10., 20., 30.], [10., 20., 30.] .+ 3), + children=repeat([3,2,2], inner=[1], outer=[2]),state=[:a,:a,:a,:b,:b,:b]) x = @from i in df begin @group i by i.state into g - @select {group=key(g),mage=mean(g.age), oldest=maximum(g.age), youngest=minimum(g.age)} + @select {group = key(g),mage = mean(g.age), oldest = maximum(g.age), youngest = minimum(g.age)} @collect DataFrame end @test x isa DataFrame - @test size(x) == (2,4) + @test size(x) == (2, 4) @test x[1,:mage] == 20 @test x[2,:mage] == 23 @test x[1,:oldest] == 30 diff --git a/test/test_pipesyntax.jl b/test/test_pipesyntax.jl index 4ddf0727..b2928ad9 100644 --- a/test/test_pipesyntax.jl +++ b/test/test_pipesyntax.jl @@ -7,12 +7,12 @@ using Test df = DataFrame(a=[1,2,3], b=[3.,2.,1.], c=["a", "b", "c"]) df2 = df |> @query(i, begin - @where i.a>2 + @where i.a > 2 @select {i.c, i.b} end) |> DataFrame @test df2 isa DataFrame - @test size(df2) == (1,2) + @test size(df2) == (1, 2) @test df2[1,:c] == "c" @test df2[1,:b] == 1. end diff --git a/test/test_standalone.jl b/test/test_standalone.jl index 4ed987ac..9253cfa7 100644 --- a/test/test_standalone.jl +++ b/test/test_standalone.jl @@ -4,33 +4,33 @@ using Test @testset "Standalone Syntax" begin -@testset "@take operator" begin - df = DataFrame(a=[1,2,3], b=[3.,2.,1.], c=["a", "b", "c"]) + @testset "@take operator" begin + df = DataFrame(a=[1,2,3], b=[3.,2.,1.], c=["a", "b", "c"]) - df2 = df |> @take(2) |> DataFrame + df2 = df |> @take(2) |> DataFrame - @test df2 isa DataFrame - @test size(df2) == (2,3) - @test df2[:a] == [1,2] - @test df2[:b] == [3.,2.] - @test df2[:c] == ["a", "b"] + @test df2 isa DataFrame + @test size(df2) == (2, 3) + @test df2[:a] == [1,2] + @test df2[:b] == [3.,2.] + @test df2[:c] == ["a", "b"] - df2 = DataFrame(@take(df, 2)) + df2 = DataFrame(@take(df, 2)) - @test df2 isa DataFrame - @test size(df2) == (2,3) - @test df2[:a] == [1,2] - @test df2[:b] == [3.,2.] - @test df2[:c] == ["a", "b"] -end + @test df2 isa DataFrame + @test size(df2) == (2, 3) + @test df2[:a] == [1,2] + @test df2[:b] == [3.,2.] + @test df2[:c] == ["a", "b"] + end @testset "@drop operator" begin df = DataFrame(a=[1,2,3], b=[3.,2.,1.], c=["a", "b", "c"]) df2 = df |> @drop(1) |> DataFrame - + @test df2 isa DataFrame - @test size(df2) == (2,3) + @test size(df2) == (2, 3) @test df2[:a] == [2,3] @test df2[:b] == [2.,1.] @test df2[:c] == ["b","c"] @@ -38,7 +38,7 @@ end df2 = DataFrame(@drop(df, 1)) @test df2 isa DataFrame - @test size(df2) == (2,3) + @test size(df2) == (2, 3) @test df2[:a] == [2,3] @test df2[:b] == [2.,1.] @test df2[:c] == ["b","c"] @@ -47,8 +47,8 @@ end @testset "@unique operator" begin df = DataFrame(a=[1,2,1], b=[3.,3.,3.]) - @test df |> @unique() |> collect == [(a=1,b=3.), (a=2,b=3.)] - @test df |> @unique(_.b) |> collect == [(a=1,b=3.)] + @test df |> @unique() |> collect == [(a = 1, b = 3.), (a = 2, b = 3.)] + @test df |> @unique(_.b) |> collect == [(a = 1, b = 3.)] end end