Skip to content

Commit d324f57

Browse files
Xu KuohaiNobody
Xu Kuohai
authored and
Nobody
committed
bpf, arm64: Optimize BPF store/load using arm64 str/ldr(immediate offset)
The current BPF store/load instruction is translated by the JIT into two instructions. The first instruction moves the immediate offset into a temporary register. The second instruction uses this temporary register to do the real store/load. In fact, arm64 supports addressing with immediate offsets. So this patch introduces an optimization that uses the arm64 str/ldr instruction with an immediate offset when the offset fits. Example of the generated instruction for r2 = *(u64 *)(r1 + 0): without optimization: mov x10, 0 ldr x1, [x0, x10] with optimization: ldr x1, [x0, 0] If the offset is negative, is not aligned correctly, or exceeds the max value, fall back to the use of a temporary register. Signed-off-by: Xu Kuohai <[email protected]>
1 parent 64fdb53 commit d324f57

File tree

2 files changed

+127
-15
lines changed

2 files changed

+127
-15
lines changed

arch/arm64/net/bpf_jit.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,20 @@
6666
#define A64_STR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, STORE)
6767
#define A64_LDR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, LOAD)
6868

69+
/* Load/store register (immediate offset) */
70+
#define A64_LS_IMM(Rt, Rn, imm, size, type) \
71+
aarch64_insn_gen_load_store_imm(Rt, Rn, imm, \
72+
AARCH64_INSN_SIZE_##size, \
73+
AARCH64_INSN_LDST_##type##_IMM_OFFSET)
74+
#define A64_STRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, STORE)
75+
#define A64_LDRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, LOAD)
76+
#define A64_STRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, STORE)
77+
#define A64_LDRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, LOAD)
78+
#define A64_STR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, STORE)
79+
#define A64_LDR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, LOAD)
80+
#define A64_STR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, STORE)
81+
#define A64_LDR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, LOAD)
82+
6983
/* Load/store register pair */
7084
#define A64_LS_PAIR(Rt, Rt2, Rn, offset, ls, type) \
7185
aarch64_insn_gen_load_store_pair(Rt, Rt2, Rn, offset, \

arch/arm64/net/bpf_jit_comp.c

Lines changed: 113 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,47 @@ static bool is_addsub_imm(u32 imm)
191191
return !(imm & ~0xfff) || !(imm & ~0xfff000);
192192
}
193193

194+
/*
195+
* There are 3 types of AArch64 LDR/STR (immediate) instruction:
196+
* Post-index, Pre-index, Unsigned offset.
197+
*
198+
* For BPF ldr/str, the "unsigned offset" type is sufficient.
199+
*
200+
* "Unsigned offset" type LDR(immediate) format:
201+
*
202+
* 3 2 1 0
203+
* 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
204+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
205+
* |x x|1 1 1 0 0 1 0 1| imm12 | Rn | Rt |
206+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
207+
* scale
208+
*
209+
* "Unsigned offset" type STR(immediate) format:
210+
* 3 2 1 0
211+
* 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
212+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
213+
* |x x|1 1 1 0 0 1 0 0| imm12 | Rn | Rt |
214+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
215+
* scale
216+
*
217+
* The offset is calculated from imm12 and scale in the following way:
218+
*
219+
* offset = (u64)imm12 << scale
220+
*/
221+
static bool is_lsi_offset(s16 offset, int scale)
222+
{
223+
if (offset < 0)
224+
return false;
225+
226+
if (offset > (0xFFF << scale))
227+
return false;
228+
229+
if (offset & ((1 << scale) - 1))
230+
return false;
231+
232+
return true;
233+
}
234+
194235
/* Tail call offset to jump into */
195236
#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)
196237
#define PROLOGUE_OFFSET 8
@@ -971,19 +1012,38 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
9711012
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
9721013
case BPF_LDX | BPF_PROBE_MEM | BPF_H:
9731014
case BPF_LDX | BPF_PROBE_MEM | BPF_B:
974-
emit_a64_mov_i(1, tmp, off, ctx);
9751015
switch (BPF_SIZE(code)) {
9761016
case BPF_W:
977-
emit(A64_LDR32(dst, src, tmp), ctx);
1017+
if (is_lsi_offset(off, 2)) {
1018+
emit(A64_LDR32I(dst, src, off), ctx);
1019+
} else {
1020+
emit_a64_mov_i(1, tmp, off, ctx);
1021+
emit(A64_LDR32(dst, src, tmp), ctx);
1022+
}
9781023
break;
9791024
case BPF_H:
980-
emit(A64_LDRH(dst, src, tmp), ctx);
1025+
if (is_lsi_offset(off, 1)) {
1026+
emit(A64_LDRHI(dst, src, off), ctx);
1027+
} else {
1028+
emit_a64_mov_i(1, tmp, off, ctx);
1029+
emit(A64_LDRH(dst, src, tmp), ctx);
1030+
}
9811031
break;
9821032
case BPF_B:
983-
emit(A64_LDRB(dst, src, tmp), ctx);
1033+
if (is_lsi_offset(off, 0)) {
1034+
emit(A64_LDRBI(dst, src, off), ctx);
1035+
} else {
1036+
emit_a64_mov_i(1, tmp, off, ctx);
1037+
emit(A64_LDRB(dst, src, tmp), ctx);
1038+
}
9841039
break;
9851040
case BPF_DW:
986-
emit(A64_LDR64(dst, src, tmp), ctx);
1041+
if (is_lsi_offset(off, 3)) {
1042+
emit(A64_LDR64I(dst, src, off), ctx);
1043+
} else {
1044+
emit_a64_mov_i(1, tmp, off, ctx);
1045+
emit(A64_LDR64(dst, src, tmp), ctx);
1046+
}
9871047
break;
9881048
}
9891049

@@ -1011,20 +1071,39 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
10111071
case BPF_ST | BPF_MEM | BPF_B:
10121072
case BPF_ST | BPF_MEM | BPF_DW:
10131073
/* Load imm to a register then store it */
1014-
emit_a64_mov_i(1, tmp2, off, ctx);
10151074
emit_a64_mov_i(1, tmp, imm, ctx);
10161075
switch (BPF_SIZE(code)) {
10171076
case BPF_W:
1018-
emit(A64_STR32(tmp, dst, tmp2), ctx);
1077+
if (is_lsi_offset(off, 2)) {
1078+
emit(A64_STR32I(tmp, dst, off), ctx);
1079+
} else {
1080+
emit_a64_mov_i(1, tmp2, off, ctx);
1081+
emit(A64_STR32(tmp, dst, tmp2), ctx);
1082+
}
10191083
break;
10201084
case BPF_H:
1021-
emit(A64_STRH(tmp, dst, tmp2), ctx);
1085+
if (is_lsi_offset(off, 1)) {
1086+
emit(A64_STRHI(tmp, dst, off), ctx);
1087+
} else {
1088+
emit_a64_mov_i(1, tmp2, off, ctx);
1089+
emit(A64_STRH(tmp, dst, tmp2), ctx);
1090+
}
10221091
break;
10231092
case BPF_B:
1024-
emit(A64_STRB(tmp, dst, tmp2), ctx);
1093+
if (is_lsi_offset(off, 0)) {
1094+
emit(A64_STRBI(tmp, dst, off), ctx);
1095+
} else {
1096+
emit_a64_mov_i(1, tmp2, off, ctx);
1097+
emit(A64_STRB(tmp, dst, tmp2), ctx);
1098+
}
10251099
break;
10261100
case BPF_DW:
1027-
emit(A64_STR64(tmp, dst, tmp2), ctx);
1101+
if (is_lsi_offset(off, 3)) {
1102+
emit(A64_STR64I(tmp, dst, off), ctx);
1103+
} else {
1104+
emit_a64_mov_i(1, tmp2, off, ctx);
1105+
emit(A64_STR64(tmp, dst, tmp2), ctx);
1106+
}
10281107
break;
10291108
}
10301109
break;
@@ -1034,19 +1113,38 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
10341113
case BPF_STX | BPF_MEM | BPF_H:
10351114
case BPF_STX | BPF_MEM | BPF_B:
10361115
case BPF_STX | BPF_MEM | BPF_DW:
1037-
emit_a64_mov_i(1, tmp, off, ctx);
10381116
switch (BPF_SIZE(code)) {
10391117
case BPF_W:
1040-
emit(A64_STR32(src, dst, tmp), ctx);
1118+
if (is_lsi_offset(off, 2)) {
1119+
emit(A64_STR32I(src, dst, off), ctx);
1120+
} else {
1121+
emit_a64_mov_i(1, tmp, off, ctx);
1122+
emit(A64_STR32(src, dst, tmp), ctx);
1123+
}
10411124
break;
10421125
case BPF_H:
1043-
emit(A64_STRH(src, dst, tmp), ctx);
1126+
if (is_lsi_offset(off, 1)) {
1127+
emit(A64_STRHI(src, dst, off), ctx);
1128+
} else {
1129+
emit_a64_mov_i(1, tmp, off, ctx);
1130+
emit(A64_STRH(src, dst, tmp), ctx);
1131+
}
10441132
break;
10451133
case BPF_B:
1046-
emit(A64_STRB(src, dst, tmp), ctx);
1134+
if (is_lsi_offset(off, 0)) {
1135+
emit(A64_STRBI(src, dst, off), ctx);
1136+
} else {
1137+
emit_a64_mov_i(1, tmp, off, ctx);
1138+
emit(A64_STRB(src, dst, tmp), ctx);
1139+
}
10471140
break;
10481141
case BPF_DW:
1049-
emit(A64_STR64(src, dst, tmp), ctx);
1142+
if (is_lsi_offset(off, 3)) {
1143+
emit(A64_STR64I(src, dst, off), ctx);
1144+
} else {
1145+
emit_a64_mov_i(1, tmp, off, ctx);
1146+
emit(A64_STR64(src, dst, tmp), ctx);
1147+
}
10501148
break;
10511149
}
10521150
break;

0 commit comments

Comments
 (0)