path: root/include/rdma
diff options
authorMatan Barak <matanb@mellanox.com>2015-07-30 18:33:26 +0300
committerDoug Ledford <dledford@redhat.com>2015-08-30 18:08:50 -0400
commit03db3a2d81e6e84f3ed3cb9e087cae17d762642b (patch)
tree772aff939dea090e0e7e2e4e37ca76c044d17403 /include/rdma
parent55aeed06544f675f25aef06a8c47b0b6b8850f4f (diff)
IB/core: Add RoCE GID table management
RoCE GIDs are based on IP addresses configured on Ethernet net-devices which relate to the RDMA (RoCE) device port. Currently, each of the low-level drivers that support RoCE (ocrdma, mlx4) manages its own RoCE port GID table. As there's nothing which is essentially vendor specific, we generalize that, and enhance the RDMA core GID cache to do this job. In order to populate the GID table, we listen for events: (a) netdev up/down/change_addr events - if a netdev is built onto our RoCE device, we need to add/delete its IPs. This involves adding all GIDs related to this ndev, add default GIDs, etc. (b) inet events - add new GIDs (according to the IP addresses) to the table. For programming the port RoCE GID table, providers must implement the add_gid and del_gid callbacks. RoCE GID management requires us to state the associated net_device alongside the GID. This information is necessary in order to manage the GID table. For example, when a net_device is removed, its associated GIDs need to be removed as well. RoCE mandates generating a default GID for each port, based on the related net-device's IPv6 link local. In contrast to the GID based on the regular IPv6 link-local (as we generate GID per IP address), the default GID is also available when the net device is down (in order to support loopback). Locking is done as follows: The patch modify the GID table code both for new RoCE drivers implementing the add_gid/del_gid callbacks and for current RoCE and IB drivers that do not. The flows for updating the table are different, so the locking requirements are too. While updating RoCE GID table, protection against multiple writers is achieved via mutex_lock(&table->lock). Since writing to a table requires us to find an entry (possible a free entry) in the table and then modify it, this mutex protects both the find_gid and write_gid ensuring the atomicity of the action. Each entry in the GID cache is protected by rwlock. In RoCE, writing (usually results from netdev notifier) involves invoking the vendor's add_gid and del_gid callbacks, which could sleep. Therefore, an invalid flag is added for each entry. Updates for RoCE are done via a workqueue, thus sleeping is permitted. In IB, updates are done in write_lock_irq(&device->cache.lock), thus write_gid isn't allowed to sleep and add_gid/del_gid are not called. When passing net-device into/out-of the GID cache, the device is always passed held (dev_hold). The code uses a single work item for updating all RDMA devices, following a netdev or inet notifier. The patch moves the cache from being a client (which was incorrect, as the cache is part of the IB infrastructure) to being explicitly initialized/freed when a device is registered/removed. Signed-off-by: Matan Barak <matanb@mellanox.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
Diffstat (limited to 'include/rdma')
1 files changed, 64 insertions, 2 deletions
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index c3540da2731f..2de56834a6be 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -65,6 +65,10 @@ union ib_gid {
} global;
+struct ib_gid_attr {
+ struct net_device *ndev;
enum rdma_node_type {
/* IB values map to NodeInfo:NodeType. */
@@ -285,7 +289,7 @@ enum ib_port_cap_flags {
+ IB_PORT_IP_BASED_GIDS = 1 << 26,
enum ib_port_width {
@@ -1487,7 +1491,7 @@ struct ib_cache {
rwlock_t lock;
struct ib_event_handler event_handler;
struct ib_pkey_cache **pkey_cache;
- struct ib_gid_cache **gid_cache;
+ struct ib_gid_table **gid_cache;
u8 *lmc_cache;
@@ -1573,9 +1577,47 @@ struct ib_device {
struct ib_port_attr *port_attr);
enum rdma_link_layer (*get_link_layer)(struct ib_device *device,
u8 port_num);
+ /* When calling get_netdev, the HW vendor's driver should return the
+ * net device of device @device at port @port_num or NULL if such
+ * a net device doesn't exist. The vendor driver should call dev_hold
+ * on this net device. The HW vendor's device driver must guarantee
+ * that this function returns NULL before the net device reaches
+ */
+ struct net_device *(*get_netdev)(struct ib_device *device,
+ u8 port_num);
int (*query_gid)(struct ib_device *device,
u8 port_num, int index,
union ib_gid *gid);
+ /* When calling add_gid, the HW vendor's driver should
+ * add the gid of device @device at gid index @index of
+ * port @port_num to be @gid. Meta-info of that gid (for example,
+ * the network device related to this gid is available
+ * at @attr. @context allows the HW vendor driver to store extra
+ * information together with a GID entry. The HW vendor may allocate
+ * memory to contain this information and store it in @context when a
+ * new GID entry is written to. Params are consistent until the next
+ * call of add_gid or delete_gid. The function should return 0 on
+ * success or error otherwise. The function could be called
+ * concurrently for different ports. This function is only called
+ * when roce_gid_table is used.
+ */
+ int (*add_gid)(struct ib_device *device,
+ u8 port_num,
+ unsigned int index,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ void **context);
+ /* When calling del_gid, the HW vendor's driver should delete the
+ * gid of device @device at gid index @index of port @port_num.
+ * Upon the deletion of a GID entry, the HW vendor must free any
+ * allocated memory. The caller will clear @context afterwards.
+ * This function is only called when roce_gid_table is used.
+ */
+ int (*del_gid)(struct ib_device *device,
+ u8 port_num,
+ unsigned int index,
+ void **context);
int (*query_pkey)(struct ib_device *device,
u8 port_num, u16 index, u16 *pkey);
int (*modify_device)(struct ib_device *device,
@@ -2108,6 +2150,26 @@ static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_n
return device->port_immutable[port_num].max_mad_size;
+ * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * RoCE GID table mechanism manages the various GIDs for a device.
+ *
+ * NOTE: if allocating the port's GID table has failed, this call will still
+ * return true, but any RoCE GID table API will fail.
+ *
+ * Return: true if the port uses RoCE GID table mechanism in order to manage
+ * its GIDs.
+ */
+static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
+ u8 port_num)
+ return rdma_protocol_roce(device, port_num) &&
+ device->add_gid && device->del_gid;
int ib_query_gid(struct ib_device *device,
u8 port_num, int index, union ib_gid *gid);

Privacy Policy