From bad481c139dcc4d3f1e61ebd4c3fe57cd8062232 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith@gmail.com>
Date: Fri, 10 Dec 2021 14:52:30 -0500
Subject: [PATCH 1/5] speed up `prime(n)`

---
 src/Primes.jl | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/Primes.jl b/src/Primes.jl
index 4cf3f0e..bddd123 100644
--- a/src/Primes.jl
+++ b/src/Primes.jl
@@ -704,8 +704,21 @@ julia> prime(3)
 
 ```
 """
-prime(::Type{T}, i::Integer) where {T<:Integer} = i < 0 ? throw(DomainError(i)) : nextprime(T(2), i)
-prime(i::Integer) = prime(Int, i)
+prime(::Type{T}, n::Integer) where {T<:Integer} = T(prime(n))
+function prime(n::Integer)
+    n = Int(n)
+    n < 0 && throw(DomainError(i))
+    n <= length(PRIMES) && return PRIMES[n]
+    n -= length(PRIMES)
+    p = 2^16
+    for q in 2 .^ (28:-4:8)
+        while n >= q
+            n -= sum(_primesmask(p,p+q))
+            p += q
+        end
+    end
+    return nextprime(p, n)
+end
 
 
 struct NextPrimes{T<:Integer}

From 7a30776a3a78c870d13e50a461da4d3165abf622 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith@gmail.com>
Date: Fri, 10 Dec 2021 16:02:58 -0500
Subject: [PATCH 2/5] 80% speedup of Primes._primesmask

---
 src/Primes.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Primes.jl b/src/Primes.jl
index bddd123..f5d72b3 100644
--- a/src/Primes.jl
+++ b/src/Primes.jl
@@ -35,7 +35,7 @@ function _primesmask(limit::Int)
     limit < 7 && throw(ArgumentError("The condition limit ≥ 7 must be met."))
     n = wheel_index(limit)
     m = wheel_prime(n)
-    sieve = ones(Bool, n)
+    sieve = trues(n)
     @inbounds for i = 1:wheel_index(isqrt(limit))
         if sieve[i]
             p = wheel_prime(i)
@@ -56,7 +56,7 @@ function _primesmask(lo::Int, hi::Int)
     lo == 7 && return _primesmask(hi)
     wlo, whi = wheel_index(lo - 1), wheel_index(hi)
     m = wheel_prime(whi)
-    sieve = ones(Bool, whi - wlo)
+    sieve = trues(whi - wlo)
     hi < 49 && return sieve
     small_sieve = _primesmask(isqrt(hi))
     @inbounds for i = 1:length(small_sieve)  # don't use eachindex here

From f2c4b0f199e7eddf6ae941b2fb8c87fb9db9f765 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith@gmail.com>
Date: Sun, 19 Dec 2021 12:10:00 -0600
Subject: [PATCH 3/5] revet `Vector{Bool}` to `BitArray`

---
 src/Primes.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Primes.jl b/src/Primes.jl
index f5d72b3..bddd123 100644
--- a/src/Primes.jl
+++ b/src/Primes.jl
@@ -35,7 +35,7 @@ function _primesmask(limit::Int)
     limit < 7 && throw(ArgumentError("The condition limit ≥ 7 must be met."))
     n = wheel_index(limit)
     m = wheel_prime(n)
-    sieve = trues(n)
+    sieve = ones(Bool, n)
     @inbounds for i = 1:wheel_index(isqrt(limit))
         if sieve[i]
             p = wheel_prime(i)
@@ -56,7 +56,7 @@ function _primesmask(lo::Int, hi::Int)
     lo == 7 && return _primesmask(hi)
     wlo, whi = wheel_index(lo - 1), wheel_index(hi)
     m = wheel_prime(whi)
-    sieve = trues(whi - wlo)
+    sieve = ones(Bool, whi - wlo)
     hi < 49 && return sieve
     small_sieve = _primesmask(isqrt(hi))
     @inbounds for i = 1:length(small_sieve)  # don't use eachindex here

From 1daa09637bb6121707b37d553246540144d3656d Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith@gmail.com>
Date: Wed, 22 Dec 2021 01:03:37 -0500
Subject: [PATCH 4/5] even bigger speedup

---
 Project.toml  |  11 +++--
 src/Primes.jl | 108 +++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/Project.toml b/Project.toml
index bc44ed7..739b32d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -2,13 +2,16 @@ name = "Primes"
 uuid = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
 version = "0.5.1"
 
+[deps]
+SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+
+[compat]
+DataStructures = "0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17"
+julia = "1"
+
 [extras]
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 test = ["DataStructures", "Test"]
-
-[compat]
-DataStructures = "0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17"
-julia = "1"
diff --git a/src/Primes.jl b/src/Primes.jl
index f5d72b3..f832662 100644
--- a/src/Primes.jl
+++ b/src/Primes.jl
@@ -3,6 +3,7 @@
 module Primes
 
 using Base.Iterators: repeated
+using SpecialFunctions #zeta function
 
 import Base: iterate, eltype, IteratorSize, IteratorEltype
 using Base: BitSigned
@@ -690,6 +691,97 @@ function prevprime(n::Integer, i::Integer=1; interval::Integer=1)
     n
 end
 
+"""
+    inthroot(n::Integer, r::Int)
+computes floor(n^(1/r)) precisely
+"""
+function inthroot(n::T, r::Int) where T <: Integer
+    ans = BigInt()
+    ccall((:__gmpz_root, :libgmp), Cvoid, (Ref{BigInt}, Ref{BigInt}, Culong), ans, BigInt(n), r)
+    return T(ans)
+end
+
+function _phi(n::Integer, a::Integer, prime_cache, sign=1)
+    a == 0 && return sign*n
+    a == 1 && return sign*(n-n÷2)
+    a == 2 && return sign*(n-n÷2 + n÷6 - n÷3)
+    a == 3 && return sign*(n - n÷2 + n÷6 - n÷3 - n÷5 + n÷10 + n÷15 - n÷30)
+    result = 0
+    while a>3
+        if n < prime_cache[a+1]
+            return result + sign
+        end
+        a -= 1
+        result += _phi(n÷prime_cache[a+1], a, prime_cache, -sign)
+    end
+    return result + _phi(n,a, prime_cache, sign)
+end
+
+"""
+computes the number of primes p<=n
+impliments the Meissel–Lehmer algorithm
+https://en.wikipedia.org/wiki/Meissel%E2%80%93Lehmer_algorithm
+theoretically, this is O(x/ln(x)^4) time, O(sqrt(x)) space
+"""
+function _pi(n::Integer, prime_cache=nothing)
+    n <= 2^16 && return searchsortedlast(PRIMES, n)
+    b = isqrt(n)
+    prime_cache==nothing && (prime_cache = primes(round(Int,b*log(b))))
+    n <= prime_cache[end] && return searchsortedlast(prime_cache, n)
+    a = _pi(inthroot(n, 4), prime_cache)
+    b = _pi(isqrt(n), prime_cache)
+    c = _pi(inthroot(n, 3), prime_cache)
+    result = _phi(n, a, prime_cache) + ((b + a - 2) * (b - a + 1)) ÷ 2
+    for i in a:(b-1)
+        nop = n ÷ prime_cache[i+1]
+        result -= _pi(nop, prime_cache)
+        if i<c
+            b_i = _pi(isqrt(nop), prime_cache)
+            for j in i:(b_i-1)
+                result -= _pi(nop ÷ prime_cache[j+1], prime_cache) - j
+            end
+        end
+    end
+    result
+end
+
+"""
+extremely accurate estimate of pi(n)
+it is a slighty modified version of the Gram series
+(also known as the Riemann prime counting function)
+https://mathworld.wolfram.com/GramSeries.html
+"""
+function _pi_est(x)
+    logx = log(x)
+    logxtk = logx
+    kfac = one(logx)
+    res = 1 - 1/logx + atan(pi/logx)/pi
+    for k in 1:10000
+        term = logxtk/(kfac*zeta(k+1)*k)
+        res += term
+        term <= eps(res) && return res
+        kfac *= k+1
+        logxtk *= logx
+    end
+end
+
+"""
+extremely accurate estimate of prime(n)
+uses Newton iteration to compute pi_est inverse
+using the fact that pi_est'(x) = 1 / log(x).
+"""
+function prime_est(x)
+    x < 2 && return 0
+    guess = x*(log(x) + log(log(x)) - 1)
+    old_term = Inf
+    while true
+        term = (_pi_est(guess) - x)*log(guess)
+        abs(term) >= abs(old_term) && return guess
+        guess -= term
+        old_term = term
+    end
+end
+
 """
     prime(::Type{<:Integer}=Int, i::Integer)
 
@@ -708,16 +800,12 @@ prime(::Type{T}, n::Integer) where {T<:Integer} = T(prime(n))
 function prime(n::Integer)
     n = Int(n)
     n < 0 && throw(DomainError(i))
-    n <= length(PRIMES) && return PRIMES[n]
-    n -= length(PRIMES)
-    p = 2^16
-    for q in 2 .^ (28:-4:8)
-        while n >= q
-            n -= sum(_primesmask(p,p+q))
-            p += q
-        end
-    end
-    return nextprime(p, n)
+    n < length(PRIMES) && return PRIMES[n]
+    # the closer prime_est is, the less work this has to do.
+    p = round(Int, prime_est(big(n)))
+    n -= _pi(p)
+    n == 0 && return p
+    n >= 0 ? nextprime(p, n) : nextprime(p+1, n-1) # in case p is prime
 end
 
 

From acf0c2fe58fc32f7a7bf6a725c803869ef8ce890 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith@gmail.com>
Date: Fri, 24 Dec 2021 02:09:52 -0500
Subject: [PATCH 5/5] better perf

---
 src/Primes.jl | 66 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 25 deletions(-)

diff --git a/src/Primes.jl b/src/Primes.jl
index a60047d..19df529 100644
--- a/src/Primes.jl
+++ b/src/Primes.jl
@@ -5,7 +5,7 @@ module Primes
 using Base.Iterators: repeated
 using SpecialFunctions #zeta function
 
-import Base: iterate, eltype, IteratorSize, IteratorEltype
+import Base: iterate, eltype, IteratorSize, IteratorEltype, Threads
 using Base: BitSigned
 using Base.Checked: checked_neg
 
@@ -695,26 +695,32 @@ end
     inthroot(n::Integer, r::Int)
 computes floor(n^(1/r)) precisely
 """
-function inthroot(n::T, r::Int) where T <: Integer
+function inthroot(x::BigInt, n::Int)
     ans = BigInt()
-    ccall((:__gmpz_root, :libgmp), Cvoid, (Ref{BigInt}, Ref{BigInt}, Culong), ans, BigInt(n), r)
-    return T(ans)
+    ccall((:__gmpz_root, :libgmp), Cvoid, (Ref{BigInt}, Ref{BigInt}, Culong), ans, BigInt(x), n)
+    ans
+end
+function inthroot(x::Int64, n::Int)
+    u, s = 1<<((64-leading_zeros(x)) ÷ n), x
+    while u != s
+        s = u
+        t = (n-1) * s + x ÷ (s ^ (n-1))
+        u = t ÷ n
+    end
+    return s
 end
 
-function _phi(n::Integer, a::Integer, prime_cache, sign=1)
-    a == 0 && return sign*n
-    a == 1 && return sign*(n-n÷2)
-    a == 2 && return sign*(n-n÷2 + n÷6 - n÷3)
-    a == 3 && return sign*(n - n÷2 + n÷6 - n÷3 - n÷5 + n÷10 + n÷15 - n÷30)
-    result = 0
-    while a>3
-        if n < prime_cache[a+1]
-            return result + sign
-        end
-        a -= 1
-        result += _phi(n÷prime_cache[a+1], a, prime_cache, -sign)
+function _phi(n::Integer, a::Integer, prime_cache)
+    a == 3 && return (n - n÷2 + n÷6 - n÷3 - n÷5 + n÷10 + n÷15 - n÷30)
+    a == 2 && return (n - n÷2 + n÷6 - n÷3)
+    a == 1 && return (n - n÷2)
+    a <= 0 && return n
+    a >= _pi_upper(n) && return 1
+    if a > _pi_upper(isqrt(n))
+        return max(_pi(n, prime_cache) - a, 0) + 1
     end
-    return result + _phi(n,a, prime_cache, sign)
+    result = @inbounds sum(_phi(n÷prime_cache[a], a-1, prime_cache) for a in 4:a)
+    return result + _phi(n, 3, prime_cache)
 end
 
 """
@@ -723,28 +729,38 @@ impliments the Meissel–Lehmer algorithm
 https://en.wikipedia.org/wiki/Meissel%E2%80%93Lehmer_algorithm
 theoretically, this is O(x/ln(x)^4) time, O(sqrt(x)) space
 """
-function _pi(n::Integer, prime_cache=nothing)
+function _pi(n::Integer)
     n <= 2^16 && return searchsortedlast(PRIMES, n)
     b = isqrt(n)
-    prime_cache==nothing && (prime_cache = primes(round(Int,b*log(b))))
+    b = round(Int,b*log(b))
+    _pi(n, b<=2^16 ? PRIMES : primes(b))
+end
+function _pi(n::Integer, prime_cache)
     n <= prime_cache[end] && return searchsortedlast(prime_cache, n)
     a = _pi(inthroot(n, 4), prime_cache)
     b = _pi(isqrt(n), prime_cache)
     c = _pi(inthroot(n, 3), prime_cache)
     result = _phi(n, a, prime_cache) + ((b + a - 2) * (b - a + 1)) ÷ 2
-    for i in a:(b-1)
+    for i in a:(c-1)
         nop = n ÷ prime_cache[i+1]
         result -= _pi(nop, prime_cache)
-        if i<c
-            b_i = _pi(isqrt(nop), prime_cache)
-            for j in i:(b_i-1)
-                result -= _pi(nop ÷ prime_cache[j+1], prime_cache) - j
-            end
+        b_i = _pi(isqrt(nop), prime_cache)
+        for j in i:(b_i-1)
+            result -= _pi(nop ÷ prime_cache[j+1], prime_cache) - j
         end
     end
+    for i in c:(b-1)
+        nop = n ÷ prime_cache[i+1]
+        result -= _pi(nop, prime_cache)
+    end
     result
 end
 
+function _pi_upper(n)
+    n <= 2^16 && return searchsortedlast(PRIMES, n)
+    round(Int, 10+n/(log(n)-1.1))
+end
+
 """
 extremely accurate estimate of pi(n)
 it is a slighty modified version of the Gram series