
Commit 0579963

yonghong-song authored and Alexei Starovoitov committed
bpf: Add batch ops to all htab bpf map
htab can't use the generic batch support due to some problematic behaviours inherent to the data structure: while iterating the bpf map, a concurrent program might delete the next entry that the batch was about to use, and in that case there is no easy way to retrieve the next entry. The issue has been discussed multiple times (see [1] and [2]).

The only way an hmap can be traversed without the problem described above is by making sure that the map is traversed one entire bucket at a time. This commit implements those strict requirements for hmap; the implementation follows the same interaction as the generic support, with some exceptions:

- If the keys/values buffers are not big enough to traverse a whole bucket, ENOSPC will be returned.
- out_batch contains the value of the next bucket in the iteration, not the next key, but this is transparent for the user since the user should never use out_batch for anything other than bpf batch syscalls.

This commit implements BPF_MAP_LOOKUP_BATCH and adds support for the new command BPF_MAP_LOOKUP_AND_DELETE_BATCH. Note that for the update/delete batch ops it is possible to use the generic implementations.

[1] https://lore.kernel.org/bpf/[email protected]/
[2] https://lore.kernel.org/bpf/[email protected]/

Signed-off-by: Yonghong Song <[email protected]>
Signed-off-by: Brian Vazquez <[email protected]>
Signed-off-by: Alexei Starovoitov <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
1 parent c60f2d2 commit 0579963

4 files changed, 276 insertions(+), 1 deletion(-)
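As the commit message notes, userspace drives these commands through the bpf(2) syscall's batch attributes. Below is a minimal usage sketch, not part of this commit: it drains a plain hash map with BPF_MAP_LOOKUP_AND_DELETE_BATCH, resuming from out_batch until the kernel reports ENOENT. The 4-byte key / 8-byte value layout, the per-call count of 128 and the error handling are illustrative assumptions; libbpf provides corresponding wrappers as well.

/* Illustrative only: drain a BPF hash map via BPF_MAP_LOOKUP_AND_DELETE_BATCH.
 * Assumes 4-byte keys, 8-byte values, and up to 128 elements per call.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int drain_htab(int map_fd)
{
        __u32 in_batch = 0, out_batch = 0;
        __u32 keys[128];                /* assumption: key_size == 4 */
        __u64 values[128];              /* assumption: value_size == 8 */
        union bpf_attr attr;
        bool first = true;
        long err;

        for (;;) {
                memset(&attr, 0, sizeof(attr));
                attr.batch.map_fd = map_fd;
                /* A NULL in_batch means "start from the first bucket". */
                attr.batch.in_batch = first ? 0 : (__u64)(unsigned long)&in_batch;
                attr.batch.out_batch = (__u64)(unsigned long)&out_batch;
                attr.batch.keys = (__u64)(unsigned long)keys;
                attr.batch.values = (__u64)(unsigned long)values;
                attr.batch.count = 128; /* room for up to 128 elements */

                err = syscall(__NR_bpf, BPF_MAP_LOOKUP_AND_DELETE_BATCH,
                              &attr, sizeof(attr));
                if (err < 0 && errno != ENOENT) {
                        /* e.g. ENOSPC: the buffers cannot hold one full
                         * bucket; a real caller would retry with larger
                         * buffers.
                         */
                        return -errno;
                }

                /* The kernel updated attr.batch.count to the number of
                 * elements copied into keys[]/values[] (also on the final
                 * ENOENT call) and out_batch to the bucket to resume from.
                 */
                printf("read and deleted %u elements\n", attr.batch.count);

                if (err < 0)            /* errno == ENOENT: map fully drained */
                        return 0;

                in_batch = out_batch;   /* resume at the next bucket */
                first = false;
        }
}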

include/linux/bpf.h

Lines changed: 3 additions & 0 deletions
@@ -46,6 +46,9 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
 	int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
 				union bpf_attr __user *uattr);
+	int (*map_lookup_and_delete_batch)(struct bpf_map *map,
+					   const union bpf_attr *attr,
+					   union bpf_attr __user *uattr);
 	int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr,
 				union bpf_attr __user *uattr);
 	int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,

include/uapi/linux/bpf.h

Lines changed: 1 addition & 0 deletions
@@ -108,6 +108,7 @@ enum bpf_cmd {
 	BPF_MAP_FREEZE,
 	BPF_BTF_GET_NEXT_ID,
 	BPF_MAP_LOOKUP_BATCH,
+	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
 	BPF_MAP_UPDATE_BATCH,
 	BPF_MAP_DELETE_BATCH,
 };

kernel/bpf/hashtab.c

Lines changed: 264 additions & 0 deletions
@@ -17,6 +17,16 @@
 	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
 	 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
 
+#define BATCH_OPS(_name)			\
+	.map_lookup_batch =			\
+	_name##_map_lookup_batch,		\
+	.map_lookup_and_delete_batch =		\
+	_name##_map_lookup_and_delete_batch,	\
+	.map_update_batch =			\
+	generic_map_update_batch,		\
+	.map_delete_batch =			\
+	generic_map_delete_batch
+
 struct bucket {
 	struct hlist_nulls_head head;
 	raw_spinlock_t lock;
@@ -1232,6 +1242,256 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+static int
+__htab_map_lookup_and_delete_batch(struct bpf_map *map,
+				   const union bpf_attr *attr,
+				   union bpf_attr __user *uattr,
+				   bool do_delete, bool is_lru_map,
+				   bool is_percpu)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
+	void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
+	void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+	void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+	u32 batch, max_count, size, bucket_size;
+	u64 elem_map_flags, map_flags;
+	struct hlist_nulls_head *head;
+	struct hlist_nulls_node *n;
+	unsigned long flags;
+	struct htab_elem *l;
+	struct bucket *b;
+	int ret = 0;
+
+	elem_map_flags = attr->batch.elem_flags;
+	if ((elem_map_flags & ~BPF_F_LOCK) ||
+	    ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+		return -EINVAL;
+
+	map_flags = attr->batch.flags;
+	if (map_flags)
+		return -EINVAL;
+
+	max_count = attr->batch.count;
+	if (!max_count)
+		return 0;
+
+	if (put_user(0, &uattr->batch.count))
+		return -EFAULT;
+
+	batch = 0;
+	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
+		return -EFAULT;
+
+	if (batch >= htab->n_buckets)
+		return -ENOENT;
+
+	key_size = htab->map.key_size;
+	roundup_key_size = round_up(htab->map.key_size, 8);
+	value_size = htab->map.value_size;
+	size = round_up(value_size, 8);
+	if (is_percpu)
+		value_size = size * num_possible_cpus();
+	total = 0;
+	/* while experimenting with hash tables with sizes ranging from 10 to
+	 * 1000, it was observed that a bucket can have upto 5 entries.
+	 */
+	bucket_size = 5;
+
+alloc:
+	/* We cannot do copy_from_user or copy_to_user inside
+	 * the rcu_read_lock. Allocate enough space here.
+	 */
+	keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
+	values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+	if (!keys || !values) {
+		ret = -ENOMEM;
+		goto after_loop;
+	}
+
+again:
+	preempt_disable();
+	this_cpu_inc(bpf_prog_active);
+	rcu_read_lock();
+again_nocopy:
+	dst_key = keys;
+	dst_val = values;
+	b = &htab->buckets[batch];
+	head = &b->head;
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	bucket_cnt = 0;
+	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+		bucket_cnt++;
+
+	if (bucket_cnt > (max_count - total)) {
+		if (total == 0)
+			ret = -ENOSPC;
+		raw_spin_unlock_irqrestore(&b->lock, flags);
+		rcu_read_unlock();
+		this_cpu_dec(bpf_prog_active);
+		preempt_enable();
+		goto after_loop;
+	}
+
+	if (bucket_cnt > bucket_size) {
+		bucket_size = bucket_cnt;
+		raw_spin_unlock_irqrestore(&b->lock, flags);
+		rcu_read_unlock();
+		this_cpu_dec(bpf_prog_active);
+		preempt_enable();
+		kvfree(keys);
+		kvfree(values);
+		goto alloc;
+	}
+
+	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+		memcpy(dst_key, l->key, key_size);
+
+		if (is_percpu) {
+			int off = 0, cpu;
+			void __percpu *pptr;
+
+			pptr = htab_elem_get_ptr(l, map->key_size);
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(dst_val + off,
+						per_cpu_ptr(pptr, cpu), size);
+				off += size;
+			}
+		} else {
+			value = l->key + roundup_key_size;
+			if (elem_map_flags & BPF_F_LOCK)
+				copy_map_value_locked(map, dst_val, value,
+						      true);
+			else
+				copy_map_value(map, dst_val, value);
+			check_and_init_map_lock(map, dst_val);
+		}
+		if (do_delete) {
+			hlist_nulls_del_rcu(&l->hash_node);
+			if (is_lru_map)
+				bpf_lru_push_free(&htab->lru, &l->lru_node);
+			else
+				free_htab_elem(htab, l);
+		}
+		dst_key += key_size;
+		dst_val += value_size;
+	}
+
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	/* If we are not copying data, we can go to next bucket and avoid
+	 * unlocking the rcu.
+	 */
+	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
+		batch++;
+		goto again_nocopy;
+	}
+
+	rcu_read_unlock();
+	this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
+	    key_size * bucket_cnt) ||
+	    copy_to_user(uvalues + total * value_size, values,
+	    value_size * bucket_cnt))) {
+		ret = -EFAULT;
+		goto after_loop;
+	}
+
+	total += bucket_cnt;
+	batch++;
+	if (batch >= htab->n_buckets) {
+		ret = -ENOENT;
+		goto after_loop;
+	}
+	goto again;
+
+after_loop:
+	if (ret == -EFAULT)
+		goto out;
+
+	/* copy # of entries and next batch */
+	ubatch = u64_to_user_ptr(attr->batch.out_batch);
+	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
+	    put_user(total, &uattr->batch.count))
+		ret = -EFAULT;
+
+out:
+	kvfree(keys);
+	kvfree(values);
+	return ret;
+}
+
+static int
+htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+						  false, true);
+}
+
+static int
+htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+					const union bpf_attr *attr,
+					union bpf_attr __user *uattr)
+{
+	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+						  false, true);
+}
+
+static int
+htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+		      union bpf_attr __user *uattr)
+{
+	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+						  false, false);
+}
+
+static int
+htab_map_lookup_and_delete_batch(struct bpf_map *map,
+				 const union bpf_attr *attr,
+				 union bpf_attr __user *uattr)
+{
+	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+						  false, false);
+}
+
+static int
+htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
+				 const union bpf_attr *attr,
+				 union bpf_attr __user *uattr)
+{
+	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+						  true, true);
+}
+
+static int
+htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+					    const union bpf_attr *attr,
+					    union bpf_attr __user *uattr)
+{
+	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+						  true, true);
+}
+
+static int
+htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+						  true, false);
+}
+
+static int
+htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
+				     const union bpf_attr *attr,
+				     union bpf_attr __user *uattr)
+{
+	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+						  true, false);
+}
+
 const struct bpf_map_ops htab_map_ops = {
 	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
@@ -1242,6 +1502,7 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_delete_elem = htab_map_delete_elem,
 	.map_gen_lookup = htab_map_gen_lookup,
 	.map_seq_show_elem = htab_map_seq_show_elem,
+	BATCH_OPS(htab),
 };
 
 const struct bpf_map_ops htab_lru_map_ops = {
@@ -1255,6 +1516,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_delete_elem = htab_lru_map_delete_elem,
 	.map_gen_lookup = htab_lru_map_gen_lookup,
 	.map_seq_show_elem = htab_map_seq_show_elem,
+	BATCH_OPS(htab_lru),
 };
 
 /* Called from eBPF program */
@@ -1368,6 +1630,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_update_elem = htab_percpu_map_update_elem,
 	.map_delete_elem = htab_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+	BATCH_OPS(htab_percpu),
 };
 
 const struct bpf_map_ops htab_lru_percpu_map_ops = {
@@ -1379,6 +1642,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_update_elem = htab_lru_percpu_map_update_elem,
 	.map_delete_elem = htab_lru_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+	BATCH_OPS(htab_lru_percpu),
 };
 
 static int fd_htab_map_alloc_check(union bpf_attr *attr)
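For readability, here is what BATCH_OPS(htab) expands to inside htab_map_ops. This is an illustration derived from the macro definition above, not extra code in the commit: the two lookup callbacks are htab-specific because they must copy whole buckets, while update/delete reuse the generic helpers.

/* Expansion of BATCH_OPS(htab) as used in htab_map_ops (illustration only). */
const struct bpf_map_ops htab_map_ops = {
	/* ... existing callbacks elided ... */
	.map_lookup_batch = htab_map_lookup_batch,
	.map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
	.map_update_batch = generic_map_update_batch,
	.map_delete_batch = generic_map_delete_batch,
};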

kernel/bpf/syscall.c

Lines changed: 8 additions & 1 deletion
@@ -3310,7 +3310,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	if (cmd == BPF_MAP_LOOKUP_BATCH &&
+	if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+	     cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
 	    !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
 		err = -EPERM;
 		goto err_put;
@@ -3324,6 +3325,8 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 
 	if (cmd == BPF_MAP_LOOKUP_BATCH)
 		BPF_DO_BATCH(map->ops->map_lookup_batch);
+	else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
+		BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
 	else if (cmd == BPF_MAP_UPDATE_BATCH)
 		BPF_DO_BATCH(map->ops->map_update_batch);
 	else
@@ -3434,6 +3437,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_LOOKUP_BATCH:
 		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
 		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr,
+				       BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+		break;
 	case BPF_MAP_UPDATE_BATCH:
 		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
 		break;