Skip to content

Commit 38a4e5e

Browse files
chrismason-xx authored and Andy Grover committed
rds: Use RCU for the bind lookup searches
The RDS bind lookups are somewhat expensive in terms of CPU time and locking overhead. This commit changes them to use a faster RCU-based hash table instead of the rbtrees they were using before. On large NUMA systems it is a significant improvement.
Signed-off-by: Chris Mason <[email protected]>
1 parent e4c52c9 commit 38a4e5e

File tree

4 files changed

+57
-45
lines changed

4 files changed

+57
-45
lines changed

net/rds/af_rds.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,15 @@ static int rds_release(struct socket *sock)
7272
* with the socket. */
7373
rds_clear_recv_queue(rs);
7474
rds_cong_remove_socket(rs);
75+
76+
/*
77+
* the binding lookup hash uses rcu, we need to
78+
* make sure we synchronize_rcu before we free our
79+
* entry
80+
*/
7581
rds_remove_bound(rs);
82+
synchronize_rcu();
83+
7684
rds_send_drop_to(rs, NULL);
7785
rds_rdma_drop_keys(rs);
7886
rds_notify_queue_get(rs, NULL);

net/rds/bind.c

Lines changed: 46 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -34,45 +34,52 @@
3434
#include <net/sock.h>
3535
#include <linux/in.h>
3636
#include <linux/if_arp.h>
37+
#include <linux/jhash.h>
3738
#include "rds.h"
3839

39-
/*
40-
* XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
41-
* particularly zippy.
42-
*
43-
* This is now called for every incoming frame so we arguably care much more
44-
* about it than we used to.
45-
*/
46-
static DEFINE_RWLOCK(rds_bind_lock);
47-
static struct rb_root rds_bind_tree = RB_ROOT;
40+
#define BIND_HASH_SIZE 1024
41+
static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
42+
static DEFINE_SPINLOCK(rds_bind_lock);
43+
44+
/*
 * Return the bind_hash_table bucket for an (addr, port) pair.
 *
 * Both values are cast directly to u32 (no byte-order conversion) and hashed
 * together with jhash_2words() using an initval of 0. The hash is reduced to
 * a table index by masking with BIND_HASH_SIZE - 1, which relies on
 * BIND_HASH_SIZE being a power of two.
 */
static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
45+
{
46+
return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
47+
(BIND_HASH_SIZE - 1));
48+
}
4849

49-
static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
50-
struct rds_sock *insert)
50+
static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
51+
struct rds_sock *insert)
5152
{
52-
struct rb_node **p = &rds_bind_tree.rb_node;
53-
struct rb_node *parent = NULL;
5453
struct rds_sock *rs;
54+
struct hlist_node *node;
55+
struct hlist_head *head = hash_to_bucket(addr, port);
5556
u64 cmp;
5657
u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
5758

58-
while (*p) {
59-
parent = *p;
60-
rs = rb_entry(parent, struct rds_sock, rs_bound_node);
61-
59+
rcu_read_lock();
60+
hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
6261
cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
6362
be16_to_cpu(rs->rs_bound_port);
6463

65-
if (needle < cmp)
66-
p = &(*p)->rb_left;
67-
else if (needle > cmp)
68-
p = &(*p)->rb_right;
69-
else
64+
if (cmp == needle) {
65+
rcu_read_unlock();
7066
return rs;
67+
}
7168
}
69+
rcu_read_unlock();
7270

7371
if (insert) {
74-
rb_link_node(&insert->rs_bound_node, parent, p);
75-
rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
72+
/*
73+
* make sure our addr and port are set before
74+
* we are added to the list, other people
75+
* in rcu will find us as soon as the
76+
* hlist_add_head_rcu is done
77+
*/
78+
insert->rs_bound_addr = addr;
79+
insert->rs_bound_port = port;
80+
rds_sock_addref(insert);
81+
82+
hlist_add_head_rcu(&insert->rs_bound_node, head);
7683
}
7784
return NULL;
7885
}
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
8693
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
8794
{
8895
struct rds_sock *rs;
89-
unsigned long flags;
9096

91-
read_lock_irqsave(&rds_bind_lock, flags);
92-
rs = rds_bind_tree_walk(addr, port, NULL);
97+
rs = rds_bind_lookup(addr, port, NULL);
98+
9399
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
94100
rds_sock_addref(rs);
95101
else
96102
rs = NULL;
97-
read_unlock_irqrestore(&rds_bind_lock, flags);
98103

99104
rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
100105
ntohs(port));
@@ -116,28 +121,21 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
116121
last = rover - 1;
117122
}
118123

119-
write_lock_irqsave(&rds_bind_lock, flags);
124+
spin_lock_irqsave(&rds_bind_lock, flags);
120125

121126
do {
122127
if (rover == 0)
123128
rover++;
124-
if (!rds_bind_tree_walk(addr, cpu_to_be16(rover), rs)) {
125-
*port = cpu_to_be16(rover);
129+
if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
130+
*port = rs->rs_bound_port;
126131
ret = 0;
132+
rdsdebug("rs %p binding to %pI4:%d\n",
133+
rs, &addr, (int)ntohs(*port));
127134
break;
128135
}
129136
} while (rover++ != last);
130137

131-
if (ret == 0) {
132-
rs->rs_bound_addr = addr;
133-
rs->rs_bound_port = *port;
134-
rds_sock_addref(rs);
135-
136-
rdsdebug("rs %p binding to %pI4:%d\n",
137-
rs, &addr, (int)ntohs(*port));
138-
}
139-
140-
write_unlock_irqrestore(&rds_bind_lock, flags);
138+
spin_unlock_irqrestore(&rds_bind_lock, flags);
141139

142140
return ret;
143141
}
@@ -146,19 +144,19 @@ void rds_remove_bound(struct rds_sock *rs)
146144
{
147145
unsigned long flags;
148146

149-
write_lock_irqsave(&rds_bind_lock, flags);
147+
spin_lock_irqsave(&rds_bind_lock, flags);
150148

151149
if (rs->rs_bound_addr) {
152150
rdsdebug("rs %p unbinding from %pI4:%d\n",
153151
rs, &rs->rs_bound_addr,
154152
ntohs(rs->rs_bound_port));
155153

156-
rb_erase(&rs->rs_bound_node, &rds_bind_tree);
154+
hlist_del_init_rcu(&rs->rs_bound_node);
157155
rds_sock_put(rs);
158156
rs->rs_bound_addr = 0;
159157
}
160158

161-
write_unlock_irqrestore(&rds_bind_lock, flags);
159+
spin_unlock_irqrestore(&rds_bind_lock, flags);
162160
}
163161

164162
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
198196

199197
out:
200198
release_sock(sk);
199+
200+
/* we might have called rds_remove_bound on error */
201+
if (ret)
202+
synchronize_rcu();
201203
return ret;
202204
}

net/rds/ib_rdma.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,8 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
303303
goto out_no_cigar;
304304
}
305305

306+
memset(ibmr, 0, sizeof(*ibmr));
307+
306308
ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
307309
(IB_ACCESS_LOCAL_WRITE |
308310
IB_ACCESS_REMOTE_READ |

net/rds/rds.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ struct rds_sock {
452452
* bound_addr used for both incoming and outgoing, no INADDR_ANY
453453
* support.
454454
*/
455-
struct rb_node rs_bound_node;
455+
struct hlist_node rs_bound_node;
456456
__be32 rs_bound_addr;
457457
__be32 rs_conn_addr;
458458
__be16 rs_bound_port;

0 commit comments

Comments
 (0)