N5N3 · N5N3 · Jul 6, 2021 · Jul 5, 2021 · Jul 5, 2021 · Jul 5, 2021
diff --git a/base/abstractarray.jl b/base/abstractarray.jl
@@ -884,34 +884,6 @@ end
 
 ## from general iterable to any array
 
-"""
-    copyto!(dest::AbstractArray, src) -> dest
-
-Copy all elements from collection `src` to array `dest`, whose length must be greater than
-or equal to the length `n` of `src`. The first `n` elements of `dest` are overwritten,
-the other elements are left untouched.
-
-See also [`copy!`](@ref Base.copy!), [`copy`](@ref).
-
-# Examples
-```jldoctest
-julia> x = [1., 0., 3., 0., 5.];
-
-julia> y = zeros(7);
-
-julia> copyto!(y, x);
-
-julia> y
-7-element Vector{Float64}:
- 1.0
- 0.0
- 3.0
- 0.0
- 5.0
- 0.0
- 0.0
-```
-"""
 function copyto!(dest::AbstractArray, src)
     destiter = eachindex(dest)
     y = iterate(destiter)
@@ -989,6 +961,65 @@ function copyto!(dest::AbstractArray, dstart::Integer, src, sstart::Integer, n::
     return dest
 end
 
+## copy between abstract arrays - generally more efficient
+## since a single index variable can be used.
+
+"""
+    copyto!(dest::AbstractArray, src) -> dest
+
+Copy all elements from collection `src` to array `dest`, whose length must be greater than
+or equal to the length `n` of `src`. The first `n` elements of `dest` are overwritten,
+the other elements are left untouched.
+
+See also [`copy!`](@ref Base.copy!), [`copy`](@ref).
+
+# Examples
+```jldoctest
+julia> x = [1., 0., 3., 0., 5.];
+
+julia> y = zeros(7);
+
+julia> copyto!(y, x);
+
+julia> y
+7-element Vector{Float64}:
+ 1.0
+ 0.0
+ 3.0
+ 0.0
+ 5.0
+ 0.0
+ 0.0
+```
+"""
+function copyto!(dest::AbstractArray, src::AbstractArray)
+    isempty(src) && return dest
+    src′ = unalias(dest, src)
+    copyto_unaliased!(IndexStyle(dest), dest, IndexStyle(src′), src′)
+end
+
+function copyto!(deststyle::IndexStyle, dest::AbstractArray, srcstyle::IndexStyle, src::AbstractArray)
+    isempty(src) && return dest
+    src′ = unalias(dest, src)
+    copyto_unaliased!(deststyle, dest, srcstyle, src′)
+end
+
+function copyto_unaliased!(deststyle::IndexStyle, dest::AbstractArray, srcstyle::IndexStyle, src::AbstractArray)
+    isempty(src) && return dest
+    length(dest) < length(src) && throw(BoundsError(dest, LinearIndices(src)))
+    _unaliased_copyto!(deststyle, dest, srcstyle, src)
+    return dest
+end
+
+# IndexCartesian and CartesianIndices have not been defined yet, so only implement Linear to Linear here.
+function _unaliased_copyto!(::IndexLinear, dest::AbstractArray, ::IndexLinear, src::AbstractArray)
+    @_inline_meta
+    Δi = firstindex(dest) - firstindex(src)
+    for i in eachindex(src)
+        @inbounds dest[i + Δi] = src[i]
+    end
+end 
+
 function copyto!(dest::AbstractArray, dstart::Integer, src::AbstractArray)
     copyto!(dest, dstart, src, first(LinearIndices(src)), length(src))
 end

diff --git a/base/array.jl b/base/array.jl
@@ -1354,14 +1354,15 @@ See also: [`push!`](@ref), [`replace`](@ref), [`popat!`](@ref), [`splice!`](@ref
 
 # Examples
 ```jldoctest
-julia> insert!([6, 5, 4, 2, 1], 4, 3)
-6-element Vector{Int64}:
- 6
- 5
- 4
- 3
- 2
+julia> insert!(Any[1:6;], 3, "here")
+7-element Vector{Any}:
  1
+ 2
+  "here"
+ 3
+ 4
+ 5
+ 6
 ```
 """
 function insert!(a::Array{T,1}, i::Integer, item) where T

diff --git a/base/deprecated.jl b/base/deprecated.jl
@@ -243,6 +243,12 @@ cat_shape(dims, shape::Tuple{}) = () # make sure `cat_shape(dims, ())` do not re
 @deprecate unsafe_indices(A) axes(A) false
 @deprecate unsafe_length(r) length(r) false
 
+# these were internal type aliases, but some packages seem to be relying on them
+const Any16{N} = Tuple{Any,Any,Any,Any,Any,Any,Any,Any,
+                        Any,Any,Any,Any,Any,Any,Any,Any,Vararg{Any,N}}
+const All16{T,N} = Tuple{T,T,T,T,T,T,T,T,
+                         T,T,T,T,T,T,T,T,Vararg{T,N}}
+
 # END 1.6 deprecations
 
 # BEGIN 1.7 deprecations

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
@@ -1100,6 +1100,40 @@ in the range of `Rdest`. The sizes of the two regions must match.
 """
 copyto!(::AbstractArray, ::CartesianIndices, ::AbstractArray, ::CartesianIndices)
 
+# Cartesian to Linear unaliased copy
+function _unaliased_copyto!(::IndexLinear, dest::AbstractArray, ::IndexCartesian, src::AbstractArray)
+    @_inline_meta
+    axs = axes(src)
+    ax, iter = axs[1], CartesianIndices(tail(axs))
+    len, j = length(ax), firstindex(dest)
+    @inbounds for I in iter
+        n = 0
+        while n < len
+            dest[j + n] = src[first(ax) + n, I.I...]
+            n += 1
+        end
+        j += len
+    end
+end
+
+# Linear to Cartesian unaliased copy
+function _unaliased_copyto!(::IndexCartesian, dest::AbstractArray, ::IndexLinear, src::AbstractArray)
+    @_inline_meta
+    axs = axes(dest)
+    ax, iter = axs[1], CartesianIndices(tail(axs))
+    len, i = length(ax), firstindex(src)
+    final = lastindex(src) + 1
+    @inbounds for I in iter
+        len′ = min(final - i, len)
+        n = 0
+        while n < len′
+            dest[first(ax) + n, I.I...] = src[i + n]
+            n += 1
+        end
+        (i += len′) == final && break
+    end
+end
+
 # circshift!
 circshift!(dest::AbstractArray, src, ::Tuple{}) = copyto!(dest, src)
 """

diff --git a/base/operators.jl b/base/operators.jl
@@ -876,8 +876,8 @@ Modulus after flooring division, returning a value `r` such that `mod(r, y) == m
 in the range ``(0, y]`` for positive `y` and in the range ``[y,0)`` for negative `y`.
 
 With integer arguments and positive `y`, this is equal to `mod(x, 1:y)`, and hence natural
-for 1-based indexing. By comparison, `mod(x, y) == mod(x, 0:y-1)` is natural for computations with 
-offsets or strides. 
+for 1-based indexing. By comparison, `mod(x, y) == mod(x, 0:y-1)` is natural for computations with
+offsets or strides.
 
 See also [`mod`](@ref), [`fld1`](@ref), [`fldmod1`](@ref).
 

diff --git a/base/range.jl b/base/range.jl
@@ -465,17 +465,17 @@ julia> LinRange(1.5, 5.5, 9)
 
 Compared to using [`range`](@ref), directly constructing a `LinRange` should
 have less overhead but won't try to correct for floating point errors:
-```julia
+```jldoctest
 julia> collect(range(-0.1, 0.3, length=5))
-5-element Array{Float64,1}:
+5-element Vector{Float64}:
  -0.1
   0.0
   0.1
   0.2
   0.3
 
 julia> collect(LinRange(-0.1, 0.3, 5))
-5-element Array{Float64,1}:
+5-element Vector{Float64}:
  -0.1
  -1.3877787807814457e-17
   0.09999999999999999

diff --git a/base/tuple.jl b/base/tuple.jl
@@ -55,11 +55,10 @@ function setindex(x::Tuple, v, i::Integer)
     _setindex(v, i, x...)
 end
 
-function _setindex(v, i::Integer, first, tail...)
+function _setindex(v, i::Integer, args...)
     @_inline_meta
-    return (ifelse(i == 1, v, first), _setindex(v, i - 1, tail...)...)
+    return ntuple(j -> ifelse(j == i, v, args[j]), length(args))
 end
-_setindex(v, i::Integer) = ()
 
 
 ## iterating ##

diff --git a/doc/build/build.md b/doc/build/build.md
@@ -250,21 +250,6 @@ Julia uses a custom fork of libuv. It is a small dependency, and can be safely b
 
 As a high-performance numerical language, Julia should be linked to a multi-threaded BLAS and LAPACK, such as OpenBLAS or ATLAS, which will provide much better performance than the reference `libblas` implementations which may be default on some systems.
 
-### Intel MKL
-
-**Note:** If you are building Julia for the sole purpose of incorporating Intel MKL, it may be beneficial to first try [MKL.jl](https://github.com/JuliaComputing/MKL.jl). This package will automatically download MKL and rebuild Julia's system image against it, sidestepping the need to set up a working build environment just to add MKL functionality. MKL.jl replaces OpenBLAS with MKL for dense linear algebra functions called directly from Julia, but SuiteSparse and other C/Fortran libraries will continue to use the BLAS they were linked against at build time. If you want SuiteSparse to use MKL, you will need to build from source.
-
-For a 64-bit architecture, the environment should be set up as follows:
-```sh
-# bash
-source /path/to/intel/bin/compilervars.sh intel64
-```
-Add the following to the `Make.user` file:
-
-    USE_INTEL_MKL = 1
-
-It is highly recommended to start with a fresh clone of the Julia repository.
-
 ## Source distributions of releases
 
 Each pre-release and release of Julia has a "full" source distribution and a "light" source

diff --git a/src/cgutils.cpp b/src/cgutils.cpp
@@ -1479,12 +1479,19 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j
     if (type_is_ghost(elty))
         return ghostValue(jltype);
     AllocaInst *intcast = NULL;
-    if (!isboxed && Order != AtomicOrdering::NotAtomic && !elty->isIntOrPtrTy() && !elty->isFloatingPointTy()) {
+    if (!isboxed && Order != AtomicOrdering::NotAtomic && !elty->isIntOrPtrTy()) {
         const DataLayout &DL = jl_data_layout;
         unsigned nb = DL.getTypeSizeInBits(elty);
         intcast = ctx.builder.CreateAlloca(elty);
         elty = Type::getIntNTy(jl_LLVMContext, nb);
     }
+    Type *realelty = elty;
+    if (Order != AtomicOrdering::NotAtomic && isa<IntegerType>(elty)) {
+        unsigned nb = cast<IntegerType>(elty)->getBitWidth();
+        unsigned nb2 = PowerOf2Ceil(nb);
+        if (nb != nb2)
+            elty = Type::getIntNTy(jl_LLVMContext, nb2);
+    }
     Type *ptrty = PointerType::get(elty, ptr->getType()->getPointerAddressSpace());
     Value *data;
     if (ptr->getType() != ptrty)
@@ -1493,7 +1500,7 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j
         data = ptr;
     if (idx_0based)
         data = ctx.builder.CreateInBoundsGEP(elty, data, idx_0based);
-    Instruction *load;
+    Value *instr;
     // TODO: can only lazy load if we can create a gc root for ptr for the lifetime of elt
     //if (elty->isAggregateType() && tbaa == tbaa_immut && !alignment) { // can lazy load on demand, no copy needed
     //    elt = data;
@@ -1503,20 +1510,23 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j
             alignment = sizeof(void*);
         else if (!alignment)
             alignment = julia_alignment(jltype);
-        load = ctx.builder.CreateAlignedLoad(data, Align(alignment), false);
-        cast<LoadInst>(load)->setOrdering(Order);
+        LoadInst *load = ctx.builder.CreateAlignedLoad(data, Align(alignment), false);
+        load->setOrdering(Order);
         if (aliasscope)
             load->setMetadata("alias.scope", aliasscope);
         if (isboxed)
-            load = maybe_mark_load_dereferenceable(load, true, jltype);
+            maybe_mark_load_dereferenceable(load, true, jltype);
         if (tbaa)
-            load = tbaa_decorate(tbaa, load);
+            tbaa_decorate(tbaa, load);
+        instr = load;
+        if (elty != realelty)
+            instr = ctx.builder.CreateTrunc(instr, realelty);
         if (intcast) {
-            ctx.builder.CreateStore(load, ctx.builder.CreateBitCast(intcast, load->getType()->getPointerTo()));
-            load = ctx.builder.CreateLoad(intcast);
+            ctx.builder.CreateStore(instr, ctx.builder.CreateBitCast(intcast, instr->getType()->getPointerTo()));
+            instr = ctx.builder.CreateLoad(intcast);
         }
         if (maybe_null_if_boxed) {
-            Value *first_ptr = isboxed ? load : extract_first_ptr(ctx, load);
+            Value *first_ptr = isboxed ? instr : extract_first_ptr(ctx, instr);
             if (first_ptr)
                 null_pointer_check(ctx, first_ptr, nullcheck);
         }
@@ -1526,9 +1536,9 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j
         //load->setMetadata(LLVMContext::MD_range, MDNode::get(jl_LLVMContext, {
         //    ConstantAsMetadata::get(ConstantInt::get(T_int8, 0)),
         //    ConstantAsMetadata::get(ConstantInt::get(T_int8, 2)) }));
-        load = ctx.builder.Insert(CastInst::Create(Instruction::Trunc, load, T_int1));
+        instr = ctx.builder.CreateTrunc(instr, T_int1);
     }
-    return mark_julia_type(ctx, load, isboxed, jltype);
+    return mark_julia_type(ctx, instr, isboxed, jltype);
 }
 
 static jl_cgval_t typed_store(jl_codectx_t &ctx,
@@ -1544,18 +1554,27 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx,
     if (type_is_ghost(elty))
         return oldval;
     Value *intcast = nullptr;
-    if (!isboxed && Order != AtomicOrdering::NotAtomic && !elty->isIntOrPtrTy() && !elty->isFloatingPointTy()) {
+    if (!isboxed && Order != AtomicOrdering::NotAtomic && !elty->isIntOrPtrTy()) {
         const DataLayout &DL = jl_data_layout;
         unsigned nb = DL.getTypeSizeInBits(elty);
         if (!issetfield)
             intcast = ctx.builder.CreateAlloca(elty);
         elty = Type::getIntNTy(jl_LLVMContext, nb);
     }
+    Type *realelty = elty;
+    if (Order != AtomicOrdering::NotAtomic && isa<IntegerType>(elty)) {
+        unsigned nb = cast<IntegerType>(elty)->getBitWidth();
+        unsigned nb2 = PowerOf2Ceil(nb);
+        if (nb != nb2)
+            elty = Type::getIntNTy(jl_LLVMContext, nb2);
+    }
     Value *r;
     if (!isboxed)
-        r = emit_unbox(ctx, elty, rhs, jltype);
+        r = emit_unbox(ctx, realelty, rhs, jltype);
     else
         r = boxed(ctx, rhs);
+    if (realelty != elty)
+        r = ctx.builder.CreateZExt(r, elty);
     Type *ptrty = PointerType::get(elty, ptr->getType()->getPointerAddressSpace());
     if (ptr->getType() != ptrty)
         ptr = ctx.builder.CreateBitCast(ptr, ptrty);
@@ -1578,18 +1597,19 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx,
                 instr->setMetadata("noalias", aliasscope);
             if (tbaa)
                 tbaa_decorate(tbaa, instr);
-        }
-        if (isreplacefield) {
-            oldval = mark_julia_type(ctx, instr, isboxed, jltype);
-            Value *first_ptr = nullptr;
-            if (maybe_null_if_boxed)
-                first_ptr = isboxed ? instr : extract_first_ptr(ctx, instr);
-            Success = emit_nullcheck_guard(ctx, first_ptr, [&] {
-                return emit_f_is(ctx, oldval, cmp);
-            });
-            BasicBlock *BB = BasicBlock::Create(jl_LLVMContext, "xchg", ctx.f);
-            ctx.builder.CreateCondBr(Success, BB, DoneBB);
-            ctx.builder.SetInsertPoint(BB);
+            assert(realelty == elty);
+            if (isreplacefield) {
+                oldval = mark_julia_type(ctx, instr, isboxed, jltype);
+                Value *first_ptr = nullptr;
+                if (maybe_null_if_boxed)
+                    first_ptr = isboxed ? instr : extract_first_ptr(ctx, instr);
+                Success = emit_nullcheck_guard(ctx, first_ptr, [&] {
+                    return emit_f_is(ctx, oldval, cmp);
+                });
+                BasicBlock *BB = BasicBlock::Create(jl_LLVMContext, "xchg", ctx.f);
+                ctx.builder.CreateCondBr(Success, BB, DoneBB);
+                ctx.builder.SetInsertPoint(BB);
+            }
         }
         StoreInst *store = ctx.builder.CreateAlignedStore(r, ptr, Align(alignment));
         store->setOrdering(Order);
@@ -1628,7 +1648,9 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx,
                     Current->addIncoming(instr, SkipBB);
                     ctx.builder.SetInsertPoint(BB);
                 }
-                Compare = emit_unbox(ctx, elty, cmp, jltype);
+                Compare = emit_unbox(ctx, realelty, cmp, jltype);
+                if (realelty != elty)
+                    Compare = ctx.builder.CreateZExt(Compare, elty);
             }
             else if (cmp.isboxed) {
                 Compare = boxed(ctx, cmp);
@@ -1676,21 +1698,26 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx,
         if (tbaa)
             tbaa_decorate(tbaa, store);
         instr = ctx.builder.Insert(ExtractValueInst::Create(store, 0));
-        Success = ctx.builder.CreateExtractValue(store, 1);
+        Success = ctx.builder.Insert(ExtractValueInst::Create(store, 1));
         Value *Done = Success;
         if (needloop) {
             if (isreplacefield) {
+                Value *realinstr = instr;
+                if (realelty != elty)
+                    realinstr = ctx.builder.CreateTrunc(instr, realelty);
                 if (intcast) {
-                    ctx.builder.CreateStore(instr, ctx.builder.CreateBitCast(intcast, instr->getType()->getPointerTo()));
+                    ctx.builder.CreateStore(realinstr, ctx.builder.CreateBitCast(intcast, realinstr->getType()->getPointerTo()));
                     oldval = mark_julia_slot(intcast, jltype, NULL, tbaa_stack);
+                    if (maybe_null_if_boxed)
+                        realinstr = ctx.builder.CreateLoad(intcast);
                 }
                 else {
-                    oldval = mark_julia_type(ctx, instr, isboxed, jltype);
+                    oldval = mark_julia_type(ctx, realinstr, isboxed, jltype);
                 }
                 Done = emit_guarded_test(ctx, ctx.builder.CreateNot(Success), false, [&] {
                     Value *first_ptr = nullptr;
                     if (maybe_null_if_boxed)
-                        first_ptr = isboxed ? instr : extract_first_ptr(ctx, instr);
+                        first_ptr = isboxed ? realinstr : extract_first_ptr(ctx, realinstr);
                     return emit_nullcheck_guard(ctx, first_ptr, [&] {
                         return emit_f_is(ctx, oldval, cmp);
                     });
@@ -1747,6 +1774,8 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx,
         }
     }
     if (!issetfield) {
+        if (realelty != elty)
+            instr = ctx.builder.Insert(CastInst::Create(Instruction::Trunc, instr, realelty));
         if (intcast) {
             ctx.builder.CreateStore(instr, ctx.builder.CreateBitCast(intcast, instr->getType()->getPointerTo()));
             instr = ctx.builder.CreateLoad(intcast);
@@ -2053,6 +2082,9 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st
         emit_atomic_error(ctx, "getfield: atomic field cannot be accessed non-atomically");
         return jl_cgval_t(); // unreachable
     }
+    if (order == jl_memory_order_unspecified) {
+        order = isatomic ? jl_memory_order_unordered : jl_memory_order_notatomic;
+    }
     if (jfty == jl_bottom_type) {
         raise_exception(ctx, literal_pointer_val(ctx, jl_undefref_exception));
         return jl_cgval_t(); // unreachable
@@ -2126,7 +2158,7 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st
         if (needlock)
             emit_lockstate_value(ctx, strct, true);
         jl_cgval_t ret = typed_load(ctx, addr, NULL, jfty, tbaa, nullptr, false,
-                needlock || order <= jl_memory_order_notatomic ? AtomicOrdering::NotAtomic : get_llvm_atomic_order(order), // TODO: we should use unordered for anything with CountTrackedPointers(elty).count > 0
+                needlock ? AtomicOrdering::NotAtomic : get_llvm_atomic_order(order), // TODO: we should use unordered for anything with CountTrackedPointers(elty).count > 0
                 maybe_null, align, nullcheck);
         if (needlock)
             emit_lockstate_value(ctx, strct, false);

diff --git a/src/datatype.c b/src/datatype.c
@@ -730,7 +730,53 @@ JL_DLLEXPORT int jl_is_foreign_type(jl_datatype_t *dt)
 #error MAX_POINTERATOMIC_SIZE too large
 #endif
 #if MAX_POINTERATOMIC_SIZE >= 16
+#ifndef _P64
+#error 12 byte GC pool size not implemented for 32-bit
+#endif
 typedef __uint128_t uint128_t;
+typedef uint128_t jl_uatomicmax_t;
+#else
+typedef uint64_t jl_uatomicmax_t;
+#endif
+
+#if BYTE_ORDER != LITTLE_ENDIAN
+#error using masks for atomics (instead of memcpy like nb == 16) assumes little endian
+#endif
+
+static inline uint32_t zext_read32(const jl_value_t *x, size_t nb) JL_NOTSAFEPOINT
+{
+    uint32_t y = *(uint32_t*)x;
+    if (nb == 4)
+        return y;
+    else // if (nb == 3)
+        return 0xffffffu & y;
+}
+
+#if MAX_POINTERATOMIC_SIZE >= 8
+static inline uint64_t zext_read64(const jl_value_t *x, size_t nb) JL_NOTSAFEPOINT
+{
+    uint64_t y = *(uint64_t*)x;
+    if (nb == 8)
+        return y;
+    else if (nb == 7)
+        return 0xffffffffffffffu & y;
+    else if (nb == 6)
+        return 0xffffffffffffu & y;
+    else // if (nb == 5)
+        return 0xffffffffffu & y;
+}
+#endif
+
+#if MAX_POINTERATOMIC_SIZE >= 16
+static inline uint128_t zext_read128(const jl_value_t *x, size_t nb) JL_NOTSAFEPOINT
+{
+    uint128_t y = 0;
+    if (nb == 16)
+        y = *(uint128_t*)x;
+    else
+        memcpy(&y, x, nb);
+    return y;
+}
 #endif
 
 JL_DLLEXPORT jl_value_t *jl_new_bits(jl_value_t *dt, const void *data)
@@ -755,16 +801,7 @@ JL_DLLEXPORT jl_value_t *jl_new_bits(jl_value_t *dt, const void *data)
 
     jl_task_t *ct = jl_current_task;
     jl_value_t *v = jl_gc_alloc(ct->ptls, nb, bt);
-    switch (nb) {
-    case  1: *(uint8_t*) v = *(uint8_t*)data;    break;
-    case  2: *(uint16_t*)v = jl_load_unaligned_i16(data);   break;
-    case  4: *(uint32_t*)v = jl_load_unaligned_i32(data);   break;
-    case  8: *(uint64_t*)v = jl_load_unaligned_i64(data);   break;
-    case 16:
-        memcpy(jl_assume_aligned(v, 16), data, 16);
-        break;
-    default: memcpy(v, data, nb);
-    }
+    memcpy(jl_assume_aligned(v, sizeof(void*)), data, nb);
     return v;
 }
 
@@ -789,40 +826,51 @@ JL_DLLEXPORT jl_value_t *jl_atomic_new_bits(jl_value_t *dt, const char *data)
 
     jl_task_t *ct = jl_current_task;
     jl_value_t *v = jl_gc_alloc(ct->ptls, nb, bt);
-    switch (nb) {
-    case  1: *(uint8_t*) v = jl_atomic_load((uint8_t*)data);    break;
-    case  2: *(uint16_t*)v = jl_atomic_load((uint16_t*)data);   break;
-    case  4: *(uint32_t*)v = jl_atomic_load((uint32_t*)data);   break;
+    // data is aligned to the next power of two >= nb, so load that many bytes atomically;
+    // this writes past nb bytes of v, but the allocation padding should cover it
+    if (nb == 1)
+        *(uint8_t*) v = jl_atomic_load((uint8_t*)data);
+    else if (nb <= 2)
+        *(uint16_t*)v = jl_atomic_load((uint16_t*)data);
+    else if (nb <= 4)
+        *(uint32_t*)v = jl_atomic_load((uint32_t*)data);
 #if MAX_POINTERATOMIC_SIZE >= 8
-    case  8: *(uint64_t*)v = jl_atomic_load((uint64_t*)data);   break;
+    else if (nb <= 8)
+        *(uint64_t*)v = jl_atomic_load((uint64_t*)data);
 #endif
 #if MAX_POINTERATOMIC_SIZE >= 16
-    case 16: *(uint128_t*)v = jl_atomic_load((uint128_t*)data);        break;
+    else if (nb <= 16)
+        *(uint128_t*)v = jl_atomic_load((uint128_t*)data);
 #endif
-    default:
+    else
         abort();
-    }
     return v;
 }
 
 JL_DLLEXPORT void jl_atomic_store_bits(char *dst, const jl_value_t *src, int nb)
 {
     // dst must have the required alignment for an atomic of the given size
     // src must be aligned by the GC
-    switch (nb) {
-    case  0:                                                   break;
-    case  1: jl_atomic_store((uint8_t*)dst, *(uint8_t*)src);   break;
-    case  2: jl_atomic_store((uint16_t*)dst, *(uint16_t*)src); break;
-    case  4: jl_atomic_store((uint32_t*)dst, *(uint32_t*)src); break;
+    // we may therefore read too much from src, but will zero the excess bits
+    // before the store (so that we can get faster cmpswap later)
+    if (nb == 0)
+        ;
+    else if (nb == 1)
+        jl_atomic_store((uint8_t*)dst, *(uint8_t*)src);
+    else if (nb == 2)
+        jl_atomic_store((uint16_t*)dst, *(uint16_t*)src);
+    else if (nb <= 4)
+        jl_atomic_store((uint32_t*)dst, zext_read32(src, nb));
 #if MAX_POINTERATOMIC_SIZE >= 8
-    case  8: jl_atomic_store((uint64_t*)dst, *(uint64_t*)src); break;
+    else if (nb <= 8)
+        jl_atomic_store((uint64_t*)dst, zext_read64(src, nb));
 #endif
 #if MAX_POINTERATOMIC_SIZE >= 16
-    case 16: jl_atomic_store((uint128_t*)dst, *(uint128_t*)src); break;
+    else if (nb <= 16)
+        jl_atomic_store((uint128_t*)dst, zext_read128(src, nb));
 #endif
-    default:
+    else
         abort();
-    }
 }
 
 JL_DLLEXPORT jl_value_t *jl_atomic_swap_bits(jl_value_t *dt, char *dst, const jl_value_t *src, int nb)
@@ -845,19 +893,22 @@ JL_DLLEXPORT jl_value_t *jl_atomic_swap_bits(jl_value_t *dt, char *dst, const jl
 
     jl_task_t *ct = jl_current_task;
     jl_value_t *v = jl_gc_alloc(ct->ptls, jl_datatype_size(bt), bt);
-    switch (nb) {
-    case  1: *(uint8_t*) v = jl_atomic_exchange((uint8_t*)dst, *(uint8_t*)src);    break;
-    case  2: *(uint16_t*)v = jl_atomic_exchange((uint16_t*)dst, *(uint16_t*)src);   break;
-    case  4: *(uint32_t*)v = jl_atomic_exchange((uint32_t*)dst, *(uint32_t*)src);   break;
+    if (nb == 1)
+        *(uint8_t*)v = jl_atomic_exchange((uint8_t*)dst, *(uint8_t*)src);
+    else if (nb == 2)
+        *(uint16_t*)v = jl_atomic_exchange((uint16_t*)dst, *(uint16_t*)src);
+    else if (nb <= 4)
+        *(uint32_t*)v = jl_atomic_exchange((uint32_t*)dst, zext_read32(src, nb));
 #if MAX_POINTERATOMIC_SIZE >= 8
-    case  8: *(uint64_t*)v = jl_atomic_exchange((uint64_t*)dst, *(uint64_t*)src);   break;
+    else if (nb <= 8)
+        *(uint64_t*)v = jl_atomic_exchange((uint64_t*)dst, zext_read64(src, nb));
 #endif
 #if MAX_POINTERATOMIC_SIZE >= 16
-    case 16: *(uint128_t*)v = jl_atomic_exchange((uint128_t*)dst, *(uint128_t*)src);   break;
+    else if (nb <= 16)
+        *(uint128_t*)v = jl_atomic_exchange((uint128_t*)dst, zext_read128(src, nb));
 #endif
-    default:
+    else
         abort();
-    }
     return v;
 }
 
@@ -866,41 +917,37 @@ JL_DLLEXPORT int jl_atomic_bool_cmpswap_bits(char *dst, const jl_value_t *expect
     // dst must have the required alignment for an atomic of the given size
     // n.b.: this can spuriously fail if there are padding bits, the caller should deal with that
     int success;
-    switch (nb) {
-    case  0: {
+    if (nb == 0) {
         success = 1;
-        break;
     }
-    case  1: {
+    else if (nb == 1) {
         uint8_t y = *(uint8_t*)expected;
         success = jl_atomic_cmpswap((uint8_t*)dst, &y, *(uint8_t*)src);
-        break;
     }
-    case  2: {
+    else if (nb == 2) {
         uint16_t y = *(uint16_t*)expected;
         success = jl_atomic_cmpswap((uint16_t*)dst, &y, *(uint16_t*)src);
-        break;
     }
-    case  4: {
-        uint32_t y = *(uint32_t*)expected;
-        success = jl_atomic_cmpswap((uint32_t*)dst, &y, *(uint32_t*)src);
-        break;
+    else if (nb <= 4) {
+        uint32_t y = zext_read32(expected, nb);
+        uint32_t z = zext_read32(src, nb);
+        success = jl_atomic_cmpswap((uint32_t*)dst, &y, z);
     }
 #if MAX_POINTERATOMIC_SIZE >= 8
-    case  8: {
-        uint64_t y = *(uint64_t*)expected;
-        success = jl_atomic_cmpswap((uint64_t*)dst, &y, *(uint64_t*)src);
-        break;
+    else if (nb <= 8) {
+        uint64_t y = zext_read64(expected, nb);
+        uint64_t z = zext_read64(src, nb);
+        success = jl_atomic_cmpswap((uint64_t*)dst, &y, z);
     }
 #endif
 #if MAX_POINTERATOMIC_SIZE >= 16
-    case 16: {
-        uint128_t y = *(uint128_t*)expected;
-        success = jl_atomic_cmpswap((uint128_t*)dst, &y, *(uint128_t*)src);
-        break;
+    else if (nb <= 16) {
+        uint128_t y = zext_read128(expected, nb);
+        uint128_t z = zext_read128(src, nb);
+        success = jl_atomic_cmpswap((uint128_t*)dst, &y, z);
     }
 #endif
-    default:
+    else {
         abort();
     }
     return success;
@@ -920,45 +967,42 @@ JL_DLLEXPORT jl_value_t *jl_atomic_cmpswap_bits(jl_datatype_t *dt, char *dst, co
     jl_value_t *y = jl_gc_alloc(ct->ptls, isptr ? nb : tuptyp->size, isptr ? dt : tuptyp);
     int success;
     jl_datatype_t *et = (jl_datatype_t*)jl_typeof(expected);
-    switch (nb) {
-    case  0: {
+    if (nb == 0) {
         success = (dt == et);
-        break;
     }
-    case  1: {
+    else if (nb == 1) {
         uint8_t *y8 = (uint8_t*)y;
+        assert(!dt->layout->haspadding);
         if (dt == et) {
             *y8 = *(uint8_t*)expected;
-            success = jl_atomic_cmpswap((uint8_t*)dst, y8, *(uint8_t*)src);
+            uint8_t z8 = *(uint8_t*)src;
+            success = jl_atomic_cmpswap((uint8_t*)dst, y8, z8);
         }
         else {
             *y8 = jl_atomic_load((uint8_t*)dst);
             success = 0;
         }
-        break;
     }
-    case  2: {
+    else if (nb == 2) {
         uint16_t *y16 = (uint16_t*)y;
+        assert(!dt->layout->haspadding);
         if (dt == et) {
             *y16 = *(uint16_t*)expected;
-            while (1) {
-                success = jl_atomic_cmpswap((uint16_t*)dst, y16, *(uint16_t*)src);
-                if (success || !dt->layout->haspadding || !jl_egal__bits(y, expected, dt))
-                    break;
-            }
+            uint16_t z16 = *(uint16_t*)src;
+            success = jl_atomic_cmpswap((uint16_t*)dst, y16, z16);
         }
         else {
             *y16 = jl_atomic_load((uint16_t*)dst);
             success = 0;
         }
-        break;
     }
-    case  4: {
+    else if (nb <= 4) {
         uint32_t *y32 = (uint32_t*)y;
         if (dt == et) {
-            *y32 = *(uint32_t*)expected;
+            *y32 = zext_read32(expected, nb);
+            uint32_t z32 = zext_read32(src, nb);
             while (1) {
-                success = jl_atomic_cmpswap((uint32_t*)dst, y32, *(uint32_t*)src);
+                success = jl_atomic_cmpswap((uint32_t*)dst, y32, z32);
                 if (success || !dt->layout->haspadding || !jl_egal__bits(y, expected, dt))
                     break;
             }
@@ -967,15 +1011,15 @@ JL_DLLEXPORT jl_value_t *jl_atomic_cmpswap_bits(jl_datatype_t *dt, char *dst, co
             *y32 = jl_atomic_load((uint32_t*)dst);
             success = 0;
         }
-        break;
     }
 #if MAX_POINTERATOMIC_SIZE >= 8
-    case  8: {
+    else if (nb <= 8) {
         uint64_t *y64 = (uint64_t*)y;
         if (dt == et) {
-            *y64 = *(uint64_t*)expected;
+            *y64 = zext_read64(expected, nb);
+            uint64_t z64 = zext_read64(src, nb);
             while (1) {
-                success = jl_atomic_cmpswap((uint64_t*)dst, y64, *(uint64_t*)src);
+                success = jl_atomic_cmpswap((uint64_t*)dst, y64, z64);
                 if (success || !dt->layout->haspadding || !jl_egal__bits(y, expected, dt))
                     break;
             }
@@ -984,16 +1028,16 @@ JL_DLLEXPORT jl_value_t *jl_atomic_cmpswap_bits(jl_datatype_t *dt, char *dst, co
             *y64 = jl_atomic_load((uint64_t*)dst);
             success = 0;
         }
-        break;
     }
 #endif
 #if MAX_POINTERATOMIC_SIZE >= 16
-    case 16: {
+    else if (nb <= 16) {
         uint128_t *y128 = (uint128_t*)y;
         if (dt == et) {
-            *y128 = *(uint128_t*)expected;
+            *y128 = zext_read128(expected, nb);
+            uint128_t z128 = zext_read128(src, nb);
             while (1) {
-                success = jl_atomic_cmpswap((uint128_t*)dst, y128, *(uint128_t*)src);
+                success = jl_atomic_cmpswap((uint128_t*)dst, y128, z128);
                 if (success || !dt->layout->haspadding || !jl_egal__bits(y, expected, dt))
                     break;
             }
@@ -1002,10 +1046,9 @@ JL_DLLEXPORT jl_value_t *jl_atomic_cmpswap_bits(jl_datatype_t *dt, char *dst, co
             *y128 = jl_atomic_load((uint128_t*)dst);
             success = 0;
         }
-        break;
     }
 #endif
-    default:
+    else {
         abort();
     }
     if (isptr) {
@@ -1417,16 +1460,12 @@ static inline void memassign_safe(int hasptr, jl_value_t *parent, char *dst, con
     else {
         // src must be a heap box.
         assert(nb == jl_datatype_size(jl_typeof(src)));
+        if (nb >= 16) {
+            memcpy(dst, jl_assume_aligned(src, 16), nb);
+            return;
+        }
     }
-    switch (nb) {
-    case  0:                                               break;
-    case  1: *(uint8_t*)dst            = *(uint8_t*)src;   break;
-    case  2: jl_store_unaligned_i16(dst, *(uint16_t*)src); break;
-    case  4: jl_store_unaligned_i32(dst, *(uint32_t*)src); break;
-    case  8: jl_store_unaligned_i64(dst, *(uint64_t*)src); break;
-    case 16: memcpy(dst, jl_assume_aligned(src, 16), 16);  break;
-    default: memcpy(dst, src, nb);                         break;
-    }
+    memcpy(dst, jl_assume_aligned(src, sizeof(void*)), nb);
 }
 
 void set_nth_field(jl_datatype_t *st, jl_value_t *v, size_t i, jl_value_t *rhs, int isatomic) JL_NOTSAFEPOINT

diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc
@@ -149,6 +149,7 @@
     XX(jl_expand_with_loc) \
     XX(jl_expand_with_loc_warn) \
     XX(jl_extern_c) \
+    XX(jl_field_index) \
     XX(jl_gc_add_finalizer) \
     XX(jl_gc_add_finalizer_th) \
     XX(jl_gc_add_ptr_finalizer) \

diff --git a/src/processor.cpp b/src/processor.cpp
@@ -401,6 +401,8 @@ static inline std::vector<uint8_t> serialize_target_data(llvm::StringRef name,
 {
     std::vector<uint8_t> res;
     auto add_data = [&] (const void *data, size_t sz) {
+        if (sz == 0)
+            return;
         size_t old_sz = res.size();
         res.resize(old_sz + sz);
         memcpy(&res[old_sz], data, sz);

diff --git a/test/atomics.jl b/test/atomics.jl
@@ -63,34 +63,59 @@ let (x, y) = (Complex{Int128}(10, 30), Complex{Int128}(20, 40))
     @test sizeof(r) == sizeof(ar) - Int(fieldoffset(typeof(ar), 1))
 end
 
+struct PadIntA <: Number # internal padding: presumably an alignment gap sits between `a` and `b` — confirm
+    a::Int8 # constant filler; the inner constructor always stores 82 here
+    b::Int16 # payload: receives the constructor argument
+    PadIntA(x) = new(82, x) # only `b` varies between instances
+end
+struct PadIntB <: Number # external padding: three one-byte fields; presumably padded up to a larger slot when stored atomically — confirm
+    a::UInt8 # bits 0-7 of the constructor argument
+    b::UInt8 # bits 8-15 of the constructor argument
+    c::UInt8 # bits 16-23 of the constructor argument
+    PadIntB(x) = new(x & 0xff, (x >> 8) & 0xff, (x >> 16) & 0xff) # split the low 24 bits of `x` into three bytes
+end
+primitive type Int24 <: Signed 24 end # integral padding: a 24-bit primitive type
+Int24(x::Int) = Core.Intrinsics.trunc_int(Int24, x) # build an Int24 by truncating an Int
+Base.Int(x::PadIntB) = x.a + (Int(x.b) << 8) + (Int(x.c) << 16) # reassemble the three bytes into one Int
+Base.:(+)(x::PadIntA, b::Int) = PadIntA(x.b + b) # add to the payload field, then re-wrap
+Base.:(+)(x::PadIntB, b::Int) = PadIntB(Int(x) + b) # add in Int space, then re-split into bytes
+Base.:(+)(x::Int24, b::Int) = Core.Intrinsics.add_int(x, Int24(b)) # truncate `b` to Int24 before the intrinsic add
+Base.show(io::IO, x::PadIntA) = print(io, "PadIntA(", x.b, ")") # print only the payload field
+Base.show(io::IO, x::PadIntB) = print(io, "PadIntB(", Int(x), ")") # print the reassembled integer
+Base.show(io::IO, x::Int24) = print(io, "Int24(", Core.Intrinsics.zext_int(Int, x), ")") # print the zero-extended value
+
 @noinline function _test_field_operators(r)
     r = r[]
     T = typeof(getfield(r, :x))
-    @test getfield(r, :x, :sequentially_consistent) === T(12345_10)
-    @test setfield!(r, :x, T(12345_1), :sequentially_consistent) === T(12345_1)
-    @test getfield(r, :x, :sequentially_consistent) === T(12345_1)
-    @test replacefield!(r, :x, 12345_1 % UInt, T(12345_100), :sequentially_consistent, :sequentially_consistent) === (T(12345_1), false)
-    @test replacefield!(r, :x, T(12345_1), T(12345_100), :sequentially_consistent, :sequentially_consistent) === (T(12345_1), true)
-    @test getfield(r, :x, :sequentially_consistent) === T(12345_100)
-    @test replacefield!(r, :x, T(12345_1), T(12345_1), :sequentially_consistent, :sequentially_consistent) === (T(12345_100), false)
-    @test getfield(r, :x, :sequentially_consistent) === T(12345_100)
-    @test modifyfield!(r, :x, add, 1, :sequentially_consistent) === (T(12345_100), T(12345_101))
-    @test modifyfield!(r, :x, add, 1, :sequentially_consistent) === (T(12345_101), T(12345_102))
-    @test getfield(r, :x, :sequentially_consistent) === T(12345_102)
-    @test swapfield!(r, :x, T(12345_1), :sequentially_consistent) === T(12345_102)
-    @test getfield(r, :x, :sequentially_consistent) === T(12345_1)
+    @test getfield(r, :x, :sequentially_consistent) === T(123_10) # field is expected to start at 123_10
+    @test setfield!(r, :x, T(123_1), :sequentially_consistent) === T(123_1) # setfield! is expected to return the stored value
+    @test getfield(r, :x, :sequentially_consistent) === T(123_1)
+    @test replacefield!(r, :x, 123_1 % UInt, T(123_30), :sequentially_consistent, :sequentially_consistent) === (T(123_1), false) # expected value is a UInt rather than a T: the swap must be rejected
+    @test replacefield!(r, :x, T(123_1), T(123_30), :sequentially_consistent, :sequentially_consistent) === (T(123_1), true) # matching expected value: the swap succeeds and the old value is returned
+    @test getfield(r, :x, :sequentially_consistent) === T(123_30)
+    @test replacefield!(r, :x, T(123_1), T(123_1), :sequentially_consistent, :sequentially_consistent) === (T(123_30), false) # stale expected value: the swap fails and the current value is returned
+    @test getfield(r, :x, :sequentially_consistent) === T(123_30) # the failed swap left the field unchanged
+    @test modifyfield!(r, :x, add, 1, :sequentially_consistent) === (T(123_30), T(123_31)) # result is the (old, new) pair
+    @test modifyfield!(r, :x, add, 1, :sequentially_consistent) === (T(123_31), T(123_32))
+    @test getfield(r, :x, :sequentially_consistent) === T(123_32)
+    @test swapfield!(r, :x, T(123_1), :sequentially_consistent) === T(123_32) # swapfield! is expected to return the previous value
+    @test getfield(r, :x, :sequentially_consistent) === T(123_1)
     nothing
 end
 @noinline function test_field_operators(r)
     _test_field_operators(Ref(copy(r)))
     _test_field_operators(Ref{Any}(copy(r)))
     nothing
 end
-test_field_operators(ARefxy{Int}(12345_10, 12345_20))
-test_field_operators(ARefxy{Any}(12345_10, 12345_20))
-test_field_operators(ARefxy{Union{Nothing,Int}}(12345_10, nothing))
-test_field_operators(ARefxy{Complex{Int32}}(12345_10, 12345_20))
-test_field_operators(ARefxy{Complex{Int128}}(12345_10, 12345_20))
+test_field_operators(ARefxy{Int}(123_10, 123_20)) # NOTE(review): constants presumably shrunk from 12345_xx so they fit PadIntA's Int16 payload — confirm
+test_field_operators(ARefxy{Any}(123_10, 123_20))
+test_field_operators(ARefxy{Union{Nothing,Int}}(123_10, nothing))
+test_field_operators(ARefxy{Complex{Int32}}(123_10, 123_20))
+test_field_operators(ARefxy{Complex{Int128}}(123_10, 123_20))
+test_field_operators(ARefxy{PadIntA}(123_10, 123_20)) # struct with internal padding
+test_field_operators(ARefxy{PadIntB}(123_10, 123_20)) # struct with external padding
+# FIXME: the Int24 (24-bit primitive) case is disabled for now: test_field_operators(ARefxy{Int24}(123_10, 123_20))
+test_field_operators(ARefxy{Float64}(123_10, 123_20)) # floating-point field
 
 @noinline function _test_field_orderings(r, x, y)
     @nospecialize x y
@@ -247,11 +272,13 @@ test_field_orderings(true, false)
 test_field_orderings("hi", "bye")
 test_field_orderings(:hi, :bye)
 test_field_orderings(nothing, nothing)
-test_field_orderings(ARefxy{Any}(12345_10, 12345_20), 12345_10, 12345_20)
+test_field_orderings(ARefxy{Any}(123_10, 123_20), 123_10, 123_20)
 test_field_orderings(ARefxy{Any}(true, false), true, false)
 test_field_orderings(ARefxy{Union{Nothing,Missing}}(nothing, missing), nothing, missing)
-test_field_orderings(ARefxy{Union{Nothing,Int}}(nothing, 12345_1), nothing, 12345_1)
+test_field_orderings(ARefxy{Union{Nothing,Int}}(nothing, 123_1), nothing, 123_1)
 test_field_orderings(Complex{Int128}(10, 30), Complex{Int128}(20, 40))
+test_field_orderings(10.0, 20.0) # plain Float64 values
+test_field_orderings(NaN, Inf) # non-finite floats; NaN != NaN under ==, so this presumably exercises identity-based (===) comparison — confirm
 
 struct UndefComplex{T}
     re::T

diff --git a/test/mpfr.jl b/test/mpfr.jl
@@ -606,7 +606,8 @@ end
         @test log(x) == log(42)
         @test isinf(log(BigFloat(0)))
         @test_throws DomainError log(BigFloat(-1))
-        @test log2(x) == log2(42)
+        # issue #41450
+        @test_skip log2(x) == log2(42)
         @test isinf(log2(BigFloat(0)))
         @test_throws DomainError log2(BigFloat(-1))
         @test log10(x) == log10(42)

diff --git a/test/tuple.jl b/test/tuple.jl
@@ -533,6 +533,9 @@ end
 
     @test Base.setindex((1, 2, 4), 4, true) === (4, 2, 4)
     @test_throws BoundsError Base.setindex((1, 2), 2, false)
+
+    f() = Base.setindex((1:1, 2:2, 3:3), 9, 1)
+    @test @inferred(f()) == (9, 2:2, 3:3)
 end
 
 @testset "inferrable range indexing with constant values" begin