[PATCH] iscsi: Perform connection failure entirely in kernel space

Gabriel Krisman Bertazi

unread,

Dec 9, 2019, 1:21:04 PM12/9/19

to ldu...@suse.com, cle...@redhat.com, martin....@oracle.com, linux...@vger.kernel.org, open-...@googlegroups.com, Bharath Ravi, ker...@collabora.com, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov, Gabriel Krisman Bertazi

From: Bharath Ravi <rbha...@google.com>

Connection failure processing depends on a daemon being present to (at
least) stop the connection and start recovery. This is a problem on a
multipath scenario, where if the daemon failed for whatever reason, the
SCSI path is never marked as down, multipath won't perform the
failover and IO to the device will be forever waiting for that
connection to come back.

This patch implements an optional feature in the iscsi module, to
perform the connection failure inside the kernel. This way, the
failover can happen and pending IO can continue even if the daemon is
dead. Once the daemon comes alive again, it can perform recovery
procedures if applicable.

Co-developed-by: Dave Clausen <dcla...@google.com>
Signed-off-by: Dave Clausen <dcla...@google.com>
Co-developed-by: Nick Black <n...@google.com>
Signed-off-by: Nick Black <n...@google.com>
Co-developed-by: Vaibhav Nagarnaik <vnaga...@google.com>
Signed-off-by: Vaibhav Nagarnaik <vnaga...@google.com>
Co-developed-by: Anatol Pomazau <ana...@google.com>
Signed-off-by: Anatol Pomazau <ana...@google.com>
Co-developed-by: Tahsin Erdogan <tah...@google.com>
Signed-off-by: Tahsin Erdogan <tah...@google.com>
Co-developed-by: Frank Mayhar <fma...@google.com>
Signed-off-by: Frank Mayhar <fma...@google.com>
Co-developed-by: Junho Ryu <ja...@google.com>
Signed-off-by: Junho Ryu <ja...@google.com>
Co-developed-by: Khazhismel Kumykov <kha...@google.com>
Signed-off-by: Khazhismel Kumykov <kha...@google.com>
Signed-off-by: Bharath Ravi <rbha...@google.com>
Co-developed-by: Gabriel Krisman Bertazi <kri...@collabora.com>
Signed-off-by: Gabriel Krisman Bertazi <kri...@collabora.com>
---
drivers/scsi/scsi_transport_iscsi.c | 46 +++++++++++++++++++++++++++++
include/scsi/scsi_transport_iscsi.h | 1 +
2 files changed, 47 insertions(+)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 417b868d8735..7251b2b5b272 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -36,6 +36,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(iscsi_dbg_session);
EXPORT_TRACEPOINT_SYMBOL_GPL(iscsi_dbg_tcp);
EXPORT_TRACEPOINT_SYMBOL_GPL(iscsi_dbg_sw_tcp);

+static bool kern_conn_failure;
+module_param(kern_conn_failure, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(kern_conn_failure,
+ "Allow the kernel to detect and disable broken connections "
+ "without requiring userspace intervention");
+
static int dbg_session;
module_param_named(debug_session, dbg_session, int,
S_IRUGO | S_IWUSR);
@@ -84,6 +90,12 @@ struct iscsi_internal {
struct transport_container session_cont;
};

+/* Worker to perform connection failure on unresponsive connections
+ * completely in kernel space.
+ */
+static void stop_conn_work_fn(struct work_struct *work);
+static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
+
static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
static struct workqueue_struct *iscsi_eh_timer_workq;

@@ -1609,6 +1621,7 @@ static DEFINE_MUTEX(rx_queue_mutex);
static LIST_HEAD(sesslist);
static DEFINE_SPINLOCK(sesslock);
static LIST_HEAD(connlist);
+static LIST_HEAD(connlist_err);
static DEFINE_SPINLOCK(connlock);

static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn)
@@ -2245,6 +2258,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)

mutex_init(&conn->ep_mutex);
INIT_LIST_HEAD(&conn->conn_list);
+ INIT_LIST_HEAD(&conn->conn_list_err);
conn->transport = transport;
conn->cid = cid;

@@ -2291,6 +2305,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn)

spin_lock_irqsave(&connlock, flags);
list_del(&conn->conn_list);
+ list_del(&conn->conn_list_err);
spin_unlock_irqrestore(&connlock, flags);

transport_unregister_device(&conn->dev);
@@ -2405,6 +2420,28 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
}
EXPORT_SYMBOL_GPL(iscsi_offload_mesg);

+static void stop_conn_work_fn(struct work_struct *work)
+{
+ struct iscsi_cls_conn *conn, *tmp;
+ unsigned long flags;
+ LIST_HEAD(recovery_list);
+
+ spin_lock_irqsave(&connlock, flags);
+ if (list_empty(&connlist_err)) {
+ spin_unlock_irqrestore(&connlock, flags);
+ return;
+ }
+ list_splice_init(&connlist_err, &recovery_list);
+ spin_unlock_irqrestore(&connlock, flags);
+
+ mutex_lock(&rx_queue_mutex);
+ list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {
+ conn->transport->stop_conn(conn, STOP_CONN_RECOVER);
+ list_del_init(&conn->conn_list_err);
+ }
+ mutex_unlock(&rx_queue_mutex);
+}
+
void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
{
struct nlmsghdr *nlh;
@@ -2412,6 +2449,15 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
struct iscsi_uevent *ev;
struct iscsi_internal *priv;
int len = nlmsg_total_size(sizeof(*ev));
+ unsigned long flags;
+
+ if (kern_conn_failure) {
+ spin_lock_irqsave(&connlock, flags);
+ list_add(&conn->conn_list_err, &connlist_err);
+ spin_unlock_irqrestore(&connlock, flags);
+
+ queue_work(system_unbound_wq, &stop_conn_work);
+ }

priv = iscsi_if_transport_lookup(conn->transport);
if (!priv)
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 325ae731d9ad..2129dc9e2dec 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -190,6 +190,7 @@ extern void iscsi_ping_comp_event(uint32_t host_no,

struct iscsi_cls_conn {
struct list_head conn_list; /* item in connlist */
+ struct list_head conn_list_err; /* item in connlist_err */
void *dd_data; /* LLD private data */
struct iscsi_transport *transport;
uint32_t cid; /* connection id */
--
2.24.0

Lee Duncan

unread,

Dec 9, 2019, 1:30:08 PM12/9/19

to Gabriel Krisman Bertazi, cle...@redhat.com, martin....@oracle.com, linux...@vger.kernel.org, open-...@googlegroups.com, Bharath Ravi, ker...@collabora.com, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov

On 12/9/19 10:20 AM, Gabriel Krisman Bertazi wrote:
> From: Bharath Ravi <rbha...@google.com>
>
> Connection failure processing depends on a daemon being present to (at
> least) stop the connection and start recovery. This is a problem on a
> multipath scenario, where if the daemon failed for whatever reason, the
> SCSI path is never marked as down, multipath won't perform the
> failover and IO to the device will be forever waiting for that
> connection to come back.
>
> This patch implements an optional feature in the iscsi module, to
> perform the connection failure inside the kernel. This way, the
> failover can happen and pending IO can continue even if the daemon is
> dead. Once the daemon comes alive again, it can perform recovery
> procedures if applicable.

I don't suppose you've tested how this plays with the daemon, when present?

Gabriel Krisman Bertazi

unread,

Dec 9, 2019, 1:39:20 PM12/9/19

to Lee Duncan, cle...@redhat.com, martin....@oracle.com, linux...@vger.kernel.org, open-...@googlegroups.com, Bharath Ravi, ker...@collabora.com, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov

Lee Duncan <LDu...@suse.com> writes:

> On 12/9/19 10:20 AM, Gabriel Krisman Bertazi wrote:
>> From: Bharath Ravi <rbha...@google.com>
>>
>> Connection failure processing depends on a daemon being present to (at
>> least) stop the connection and start recovery. This is a problem on a
>> multipath scenario, where if the daemon failed for whatever reason, the
>> SCSI path is never marked as down, multipath won't perform the
>> failover and IO to the device will be forever waiting for that
>> connection to come back.
>>
>> This patch implements an optional feature in the iscsi module, to
>> perform the connection failure inside the kernel. This way, the
>> failover can happen and pending IO can continue even if the daemon is
>> dead. Once the daemon comes alive again, it can perform recovery
>> procedures if applicable.
>
>
> I don't suppose you've tested how this plays with the daemon, when present?

Hello,

Yes, I did. It seemed to work properly over TCP.

The stop calls are serialized by the rx_mutex, whoever gets it first,
gets the job done. The second should return immediately since the socket
is no longer there.

What am I missing?

--
Gabriel Krisman Bertazi

Mike Christie

unread,

Dec 10, 2019, 7:05:27 PM12/10/19

to Gabriel Krisman Bertazi, ldu...@suse.com, cle...@redhat.com, martin....@oracle.com, linux...@vger.kernel.org, open-...@googlegroups.com, Bharath Ravi, ker...@collabora.com, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov

Do you need the modparam? I think you could handle this issue and the
similar one during shutdown at the same time, and you would always want
to do the kernel based error handler when userspace is not answering for
both cases.

You could do the following:

- Modify __iscsi_block_session so it does the stop_conn callout instead
of reverse, and change the iscsi_stop_conn/ISCSI_UEVENT_STOP_CONN:
related code accordingly.

- In iscsi_conn_error_event you would then do:

iscsi_multicast_skb();
iscsi_block_session();

- You can then drop the system_state check in iscsi_eh_cmd_timed_out
because those running commands are always handled by the stop_conn call
in __iscsi_block_session now.

Mike Christie

unread,

Dec 10, 2019, 7:23:16 PM12/10/19

to Gabriel Krisman Bertazi, ldu...@suse.com, cle...@redhat.com, martin....@oracle.com, linux...@vger.kernel.org, open-...@googlegroups.com, Bharath Ravi, ker...@collabora.com, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov

Oh yeah, on second thought, I think I like how your new function above
calls into the stop_conn callout and everything works like it did
before. Ignore the __iscsi_block_session changes. But, I would drop the
modparam, always queue the work, and then fix up the system_state check.

Gabriel Krisman Bertazi

unread,

Dec 16, 2019, 8:29:10 PM12/16/19

to mchr...@redhat.com, martin....@oracle.com, cle...@redhat.com, open-...@googlegroups.com, linux...@vger.kernel.org, ker...@collabora.com, Lee Duncan, Bart Van Assche, Gabriel Krisman Bertazi

From: Bharath Ravi <rbha...@google.com>

Connection failure processing depends on a daemon being present to (at
least) stop the connection and start recovery. This is a problem on a
multipath scenario, where if the daemon failed for whatever reason, the
SCSI path is never marked as down, multipath won't perform the
failover and IO to the device will be forever waiting for that
connection to come back.

This patch implements an optional feature in the iscsi module, to
perform the connection failure inside the kernel. This way, the
failover can happen and pending IO can continue even if the daemon is
dead. Once the daemon comes alive again, it can perform recovery
procedures if applicable.

Changes since v1:
- Remove module parameter.
- Always do kernel-side stop work.
- Block recovery timeout handler if system is dying.
- send a CONN_TERM stop if the system is dying.

Cc: Mike Christie <mchr...@redhat.com>
Cc: Lee Duncan <LDu...@suse.com>
Cc: Bart Van Assche <bvana...@acm.org>

Co-developed-by: Dave Clausen <dcla...@google.com>
Signed-off-by: Dave Clausen <dcla...@google.com>
Co-developed-by: Nick Black <n...@google.com>
Signed-off-by: Nick Black <n...@google.com>
Co-developed-by: Vaibhav Nagarnaik <vnaga...@google.com>
Signed-off-by: Vaibhav Nagarnaik <vnaga...@google.com>
Co-developed-by: Anatol Pomazau <ana...@google.com>
Signed-off-by: Anatol Pomazau <ana...@google.com>
Co-developed-by: Tahsin Erdogan <tah...@google.com>
Signed-off-by: Tahsin Erdogan <tah...@google.com>
Co-developed-by: Frank Mayhar <fma...@google.com>
Signed-off-by: Frank Mayhar <fma...@google.com>
Co-developed-by: Junho Ryu <ja...@google.com>
Signed-off-by: Junho Ryu <ja...@google.com>
Co-developed-by: Khazhismel Kumykov <kha...@google.com>
Signed-off-by: Khazhismel Kumykov <kha...@google.com>
Signed-off-by: Bharath Ravi <rbha...@google.com>
Co-developed-by: Gabriel Krisman Bertazi <kri...@collabora.com>
Signed-off-by: Gabriel Krisman Bertazi <kri...@collabora.com>
---

drivers/scsi/scsi_transport_iscsi.c | 51 +++++++++++++++++++++++++++++
include/scsi/scsi_transport_iscsi.h | 1 +
2 files changed, 52 insertions(+)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 417b868d8735..8da35ecad697 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -84,6 +84,12 @@ struct iscsi_internal {

struct transport_container session_cont;
};

+/* Worker to perform connection failure on unresponsive connections
+ * completely in kernel space.
+ */
+static void stop_conn_work_fn(struct work_struct *work);
+static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
+
static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
static struct workqueue_struct *iscsi_eh_timer_workq;

@@ -1609,6 +1615,7 @@ static DEFINE_MUTEX(rx_queue_mutex);

static LIST_HEAD(sesslist);
static DEFINE_SPINLOCK(sesslock);
static LIST_HEAD(connlist);
+static LIST_HEAD(connlist_err);
static DEFINE_SPINLOCK(connlock);

static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn)

@@ -2245,6 +2252,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)

mutex_init(&conn->ep_mutex);
INIT_LIST_HEAD(&conn->conn_list);
+ INIT_LIST_HEAD(&conn->conn_list_err);
conn->transport = transport;
conn->cid = cid;

@@ -2291,6 +2299,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn)

spin_lock_irqsave(&connlock, flags);
list_del(&conn->conn_list);
+ list_del(&conn->conn_list_err);
spin_unlock_irqrestore(&connlock, flags);

transport_unregister_device(&conn->dev);

@@ -2405,6 +2414,42 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,

}
EXPORT_SYMBOL_GPL(iscsi_offload_mesg);

+static void stop_conn_work_fn(struct work_struct *work)
+{
+ struct iscsi_cls_conn *conn, *tmp;
+ unsigned long flags;
+ LIST_HEAD(recovery_list);
+
+ spin_lock_irqsave(&connlock, flags);
+ if (list_empty(&connlist_err)) {
+ spin_unlock_irqrestore(&connlock, flags);
+ return;
+ }
+ list_splice_init(&connlist_err, &recovery_list);
+ spin_unlock_irqrestore(&connlock, flags);
+
+ mutex_lock(&rx_queue_mutex);
+ list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {

+ uint32_t sid = iscsi_conn_get_sid(conn);
+ struct iscsi_cls_session *session = iscsi_session_lookup(sid);
+
+ if (!session) {
+ list_del_init(&conn->conn_list_err);
+ continue;
+ }
+
+ if (system_state != SYSTEM_RUNNING) {
+ session->recovery_tmo = 0;
+ conn->transport->stop_conn(conn, STOP_CONN_TERM);
+ } else {

+ conn->transport->stop_conn(conn, STOP_CONN_RECOVER);
+ }

+

+ list_del_init(&conn->conn_list_err);
+ }
+ mutex_unlock(&rx_queue_mutex);
+}
+
void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
{
struct nlmsghdr *nlh;

@@ -2412,6 +2457,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)

struct iscsi_uevent *ev;
struct iscsi_internal *priv;
int len = nlmsg_total_size(sizeof(*ev));
+ unsigned long flags;
+

+ spin_lock_irqsave(&connlock, flags);
+ list_add(&conn->conn_list_err, &connlist_err);
+ spin_unlock_irqrestore(&connlock, flags);

+ queue_work(system_unbound_wq, &stop_conn_work);

priv = iscsi_if_transport_lookup(conn->transport);
if (!priv)
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 325ae731d9ad..2129dc9e2dec 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -190,6 +190,7 @@ extern void iscsi_ping_comp_event(uint32_t host_no,

struct iscsi_cls_conn {
struct list_head conn_list; /* item in connlist */
+ struct list_head conn_list_err; /* item in connlist_err */
void *dd_data; /* LLD private data */
struct iscsi_transport *transport;
uint32_t cid; /* connection id */

--
2.24.0

Khazhismel Kumykov

unread,

Dec 16, 2019, 9:16:46 PM12/16/19

to Gabriel Krisman Bertazi, ldu...@suse.com, Chris Leech, Martin K. Petersen, linux...@vger.kernel.org, 'Khazhismel Kumykov' via open-iscsi, Bharath Ravi, ker...@collabora.com, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu

Holding rx_queue_mutex for the entire conn_list_err may be problematic
for long conn_list_err, could we drop on need_resched perhaps, or otherwise
limit how long we hold this?

Gabriel Krisman Bertazi

unread,

Dec 26, 2019, 3:48:03 PM12/26/19

to ldu...@suse.com, cle...@redhat.com, je...@linux.ibm.com, martin....@oracle.com, open-...@googlegroups.com, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Mike Christie, Lee Duncan, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov, Gabriel Krisman Bertazi

From: Bharath Ravi <rbha...@google.com>

Connection failure processing depends on a daemon being present to (at
least) stop the connection and start recovery. This is a problem on a
multipath scenario, where if the daemon failed for whatever reason, the
SCSI path is never marked as down, multipath won't perform the
failover and IO to the device will be forever waiting for that
connection to come back.

This patch performs the connection failure entirely inside the kernel.

This way, the failover can happen and pending IO can continue even if

the daemon is dead. Once the daemon comes alive again, it can execute
recovery procedures if applicable.

Changes since v2:
- Don't hold rx_mutex for too long at once

Changes since v1:
- Remove module parameter.
- Always do kernel-side stop work.
- Block recovery timeout handler if system is dying.
- send a CONN_TERM stop if the system is dying.

Cc: Mike Christie <mchr...@redhat.com>
Cc: Lee Duncan <LDu...@suse.com>
Cc: Bart Van Assche <bvana...@acm.org>

Co-developed-by: Dave Clausen <dcla...@google.com>
Signed-off-by: Dave Clausen <dcla...@google.com>
Co-developed-by: Nick Black <n...@google.com>
Signed-off-by: Nick Black <n...@google.com>
Co-developed-by: Vaibhav Nagarnaik <vnaga...@google.com>
Signed-off-by: Vaibhav Nagarnaik <vnaga...@google.com>
Co-developed-by: Anatol Pomazau <ana...@google.com>
Signed-off-by: Anatol Pomazau <ana...@google.com>
Co-developed-by: Tahsin Erdogan <tah...@google.com>
Signed-off-by: Tahsin Erdogan <tah...@google.com>
Co-developed-by: Frank Mayhar <fma...@google.com>
Signed-off-by: Frank Mayhar <fma...@google.com>
Co-developed-by: Junho Ryu <ja...@google.com>
Signed-off-by: Junho Ryu <ja...@google.com>
Co-developed-by: Khazhismel Kumykov <kha...@google.com>
Signed-off-by: Khazhismel Kumykov <kha...@google.com>
Signed-off-by: Bharath Ravi <rbha...@google.com>
Co-developed-by: Gabriel Krisman Bertazi <kri...@collabora.com>
Signed-off-by: Gabriel Krisman Bertazi <kri...@collabora.com>
---

drivers/scsi/scsi_transport_iscsi.c | 63 +++++++++++++++++++++++++++++
include/scsi/scsi_transport_iscsi.h | 1 +
2 files changed, 64 insertions(+)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 271afea654e2..c6db6ded60a1 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -86,6 +86,12 @@ struct iscsi_internal {

struct transport_container session_cont;
};

+/* Worker to perform connection failure on unresponsive connections
+ * completely in kernel space.
+ */
+static void stop_conn_work_fn(struct work_struct *work);
+static DECLARE_WORK(stop_conn_work, stop_conn_work_fn);
+
static atomic_t iscsi_session_nr; /* sysfs session id for next new session */
static struct workqueue_struct *iscsi_eh_timer_workq;

@@ -1611,6 +1617,7 @@ static DEFINE_MUTEX(rx_queue_mutex);

static LIST_HEAD(sesslist);
static DEFINE_SPINLOCK(sesslock);
static LIST_HEAD(connlist);
+static LIST_HEAD(connlist_err);
static DEFINE_SPINLOCK(connlock);

static uint32_t iscsi_conn_get_sid(struct iscsi_cls_conn *conn)

@@ -2247,6 +2254,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid)

mutex_init(&conn->ep_mutex);
INIT_LIST_HEAD(&conn->conn_list);
+ INIT_LIST_HEAD(&conn->conn_list_err);
conn->transport = transport;
conn->cid = cid;

@@ -2293,6 +2301,7 @@ int iscsi_destroy_conn(struct iscsi_cls_conn *conn)

spin_lock_irqsave(&connlock, flags);
list_del(&conn->conn_list);
+ list_del(&conn->conn_list_err);
spin_unlock_irqrestore(&connlock, flags);

transport_unregister_device(&conn->dev);

@@ -2407,6 +2416,51 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,

}
EXPORT_SYMBOL_GPL(iscsi_offload_mesg);

+static void stop_conn_work_fn(struct work_struct *work)
+{
+ struct iscsi_cls_conn *conn, *tmp;
+ unsigned long flags;
+ LIST_HEAD(recovery_list);
+
+ spin_lock_irqsave(&connlock, flags);
+ if (list_empty(&connlist_err)) {
+ spin_unlock_irqrestore(&connlock, flags);
+ return;
+ }
+ list_splice_init(&connlist_err, &recovery_list);
+ spin_unlock_irqrestore(&connlock, flags);
+

+ list_for_each_entry_safe(conn, tmp, &recovery_list, conn_list_err) {

+ uint32_t sid = iscsi_conn_get_sid(conn);

+ struct iscsi_cls_session *session;
+
+ mutex_lock(&rx_queue_mutex);
+
+ session = iscsi_session_lookup(sid);
+ if (session) {

+ if (system_state != SYSTEM_RUNNING) {
+ session->recovery_tmo = 0;
+ conn->transport->stop_conn(conn,

+ STOP_CONN_TERM);

+ } else {
+ conn->transport->stop_conn(conn,

+ STOP_CONN_RECOVER);
+ }
+ }
+

+ list_del_init(&conn->conn_list_err);
+
+ mutex_unlock(&rx_queue_mutex);

+
+ /* we don't want to hold rx_queue_mutex for too long,
+ * for instance if many conns failed at the same time,
+ * since this stall other iscsi maintenance operations.
+ * Give other users a chance to proceed.
+ */
+ cond_resched();
+ }

+}
+
void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)
{
struct nlmsghdr *nlh;

@@ -2414,6 +2468,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error)

struct iscsi_uevent *ev;
struct iscsi_internal *priv;
int len = nlmsg_total_size(sizeof(*ev));
+ unsigned long flags;
+

+ spin_lock_irqsave(&connlock, flags);
+ list_add(&conn->conn_list_err, &connlist_err);
+ spin_unlock_irqrestore(&connlock, flags);

+ queue_work(system_unbound_wq, &stop_conn_work);

priv = iscsi_if_transport_lookup(conn->transport);
if (!priv)

@@ -2748,6 +2808,9 @@ iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev
if (!conn)
return -EINVAL;

+ if (!list_empty(&conn->conn_list_err))
+ return -EAGAIN;
+
ISCSI_DBG_TRANS_CONN(conn, "Destroying transport conn\n");
if (transport->destroy_conn)
transport->destroy_conn(conn);

diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 325ae731d9ad..2129dc9e2dec 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -190,6 +190,7 @@ extern void iscsi_ping_comp_event(uint32_t host_no,

struct iscsi_cls_conn {
struct list_head conn_list; /* item in connlist */
+ struct list_head conn_list_err; /* item in connlist_err */
void *dd_data; /* LLD private data */
struct iscsi_transport *transport;
uint32_t cid; /* connection id */
--

2.24.1

Lee Duncan

unread,

Jan 1, 2020, 1:46:35 PM1/1/20

to Gabriel Krisman Bertazi, cle...@redhat.com, je...@linux.ibm.com, martin....@oracle.com, open-...@googlegroups.com, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Mike Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov

Reviewed-by: Lee Duncan <ldu...@suse.com>

Khazhismel Kumykov

unread,

Jan 2, 2020, 12:08:06 PM1/2/20

to Gabriel Krisman Bertazi, ldu...@suse.com, Chris Leech, je...@linux.ibm.com, Martin K. Petersen, 'Khazhismel Kumykov' via open-iscsi, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Mike Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu

This worried me a bit, but it seems we won't destroy_conn while it's
on the err list - cool.

Does this check need to be under connlock?

Gabriel Krisman Bertazi

unread,

Jan 2, 2020, 1:13:42 PM1/2/20

to Khazhismel Kumykov, ldu...@suse.com, Chris Leech, je...@linux.ibm.com, Martin K. Petersen, 'Khazhismel Kumykov' via open-iscsi, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Mike Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu

My understanding is that it is not necessary, since it is serialized
against the conn removal itself, through the rx_mutex, it seemed safe to
do the verification lockless.

It can only race with the insertion, in which case, it will be safely
removed from the dispatch list here, under rx_mutex, and the worker will
detect and skipped it.

--
Gabriel Krisman Bertazi

Khazhismel Kumykov

unread,

Jan 2, 2020, 1:24:53 PM1/2/20

to Gabriel Krisman Bertazi, ldu...@suse.com, Chris Leech, je...@linux.ibm.com, Martin K. Petersen, 'Khazhismel Kumykov' via open-iscsi, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Mike Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu

On Thu, Jan 2, 2020 at 1:13 PM Gabriel Krisman Bertazi

My worry is the splice, which is under only connlock, not rx_mutex, which
might lead to UB if we're checking empty while modifying the list_head ?

>
> --
> Gabriel Krisman Bertazi

Gabriel Krisman Bertazi

unread,

Jan 3, 2020, 2:26:17 PM1/3/20

to Khazhismel Kumykov, ldu...@suse.com, Chris Leech, je...@linux.ibm.com, Martin K. Petersen, 'Khazhismel Kumykov' via open-iscsi, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Mike Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu

Khazhismel Kumykov <kha...@google.com> writes:

>> >> + if (!list_empty(&conn->conn_list_err))
>> > Does this check need to be under connlock?
>>
>> My understanding is that it is not necessary, since it is serialized
>> against the conn removal itself, through the rx_mutex, it seemed safe to
>> do the verification lockless.
>>
>> It can only race with the insertion, in which case, it will be safely
>> removed from the dispatch list here, under rx_mutex, and the worker will
>> detect and skipped it.
>
> My worry is the splice, which is under only connlock, not rx_mutex, which
> might lead to UB if we're checking empty while modifying the list_head ?

Hi,

Please consider the v4 below with the lock added.

-- >8 --

From: Bharath Ravi <rbha...@google.com>

Connection failure processing depends on a daemon being present to (at
least) stop the connection and start recovery. This is a problem on a
multipath scenario, where if the daemon failed for whatever reason, the
SCSI path is never marked as down, multipath won't perform the
failover and IO to the device will be forever waiting for that
connection to come back.

This patch performs the connection failure entirely inside the kernel.
This way, the failover can happen and pending IO can continue even if
the daemon is dead. Once the daemon comes alive again, it can execute
recovery procedures if applicable.

Changes since v3:
- Protect list_empty with connlock on session destroy

Changes since v2:
- Don't hold rx_mutex for too long at once

Changes since v1:
- Remove module parameter.
- Always do kernel-side stop work.
- Block recovery timeout handler if system is dying.
- send a CONN_TERM stop if the system is dying.

Cc: Mike Christie <mchr...@redhat.com>
Cc: Lee Duncan <LDu...@suse.com>
Cc: Bart Van Assche <bvana...@acm.org>

Reviewed-by: Lee Duncan <ldu...@suse.com>

Co-developed-by: Dave Clausen <dcla...@google.com>
Signed-off-by: Dave Clausen <dcla...@google.com>
Co-developed-by: Nick Black <n...@google.com>
Signed-off-by: Nick Black <n...@google.com>
Co-developed-by: Vaibhav Nagarnaik <vnaga...@google.com>
Signed-off-by: Vaibhav Nagarnaik <vnaga...@google.com>
Co-developed-by: Anatol Pomazau <ana...@google.com>
Signed-off-by: Anatol Pomazau <ana...@google.com>
Co-developed-by: Tahsin Erdogan <tah...@google.com>
Signed-off-by: Tahsin Erdogan <tah...@google.com>
Co-developed-by: Frank Mayhar <fma...@google.com>
Signed-off-by: Frank Mayhar <fma...@google.com>
Co-developed-by: Junho Ryu <ja...@google.com>
Signed-off-by: Junho Ryu <ja...@google.com>
Co-developed-by: Khazhismel Kumykov <kha...@google.com>
Signed-off-by: Khazhismel Kumykov <kha...@google.com>
Signed-off-by: Bharath Ravi <rbha...@google.com>
Co-developed-by: Gabriel Krisman Bertazi <kri...@collabora.com>
Signed-off-by: Gabriel Krisman Bertazi <kri...@collabora.com>
---

drivers/scsi/scsi_transport_iscsi.c | 68 +++++++++++++++++++++++++++++
include/scsi/scsi_transport_iscsi.h | 1 +
2 files changed, 69 insertions(+)

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 271afea654e2..ba6cfaf71aef 100644

@@ -2743,11 +2803,19 @@ static int
iscsi_if_destroy_conn(struct iscsi_transport *transport, struct iscsi_uevent *ev)
{
struct iscsi_cls_conn *conn;
+ unsigned long flags;

conn = iscsi_conn_lookup(ev->u.d_conn.sid, ev->u.d_conn.cid);
if (!conn)
return -EINVAL;

+ spin_lock_irqsave(&connlock, flags);
+ if (!list_empty(&conn->conn_list_err)) {
+ spin_unlock_irqrestore(&connlock, flags);
+ return -EAGAIN;
+ }
+ spin_unlock_irqrestore(&connlock, flags);

Khazhismel Kumykov

unread,

Jan 3, 2020, 2:40:34 PM1/3/20

to Gabriel Krisman Bertazi, ldu...@suse.com, Chris Leech, je...@linux.ibm.com, Martin K. Petersen, 'Khazhismel Kumykov' via open-iscsi, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Mike Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu

On Fri, Jan 3, 2020 at 2:26 PM Gabriel Krisman Bertazi
<kri...@collabora.com> wrote:
> Please consider the v4 below with the lock added.
>

Reviewed-by: Khazhismel Kumykov <kha...@google.com>

Martin K. Petersen

unread,

Jan 15, 2020, 10:52:39 PM1/15/20

to ldu...@suse.com, Gabriel Krisman Bertazi, Khazhismel Kumykov, Chris Leech, je...@linux.ibm.com, Martin K. Petersen, 'Khazhismel Kumykov' via open-iscsi, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Mike Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu

> Please consider the v4 below with the lock added.

Lee: Please re-review this given the code change.

> From: Bharath Ravi <rbha...@google.com>
>
> Connection failure processing depends on a daemon being present to (at
> least) stop the connection and start recovery. This is a problem on a
> multipath scenario, where if the daemon failed for whatever reason, the
> SCSI path is never marked as down, multipath won't perform the
> failover and IO to the device will be forever waiting for that
> connection to come back.
>
> This patch performs the connection failure entirely inside the kernel.
> This way, the failover can happen and pending IO can continue even if
> the daemon is dead. Once the daemon comes alive again, it can execute
> recovery procedures if applicable.

--
Martin K. Petersen Oracle Linux Engineering

The Lee-Man

unread,

Jan 23, 2020, 4:56:04 PM1/23/20

to open-iscsi

On Wednesday, January 15, 2020 at 7:52:39 PM UTC-8, Martin K. Petersen wrote:

> Please consider the v4 below with the lock added.

Lee: Please re-review this given the code change.

Martin:

The recent change makes sense, so please still include my:

Reviewed-by: Lee Duncan <ldu...@suse.com>

Gabriel Krisman Bertazi

unread,

Jan 25, 2020, 1:19:38 AM1/25/20

to ldu...@suse.com, martin....@oracle.com, linux...@vger.kernel.org, open-...@googlegroups.com, Bharath Ravi, ker...@collabora.com, Mike Christie, Lee Duncan, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov, Gabriel Krisman Bertazi

From: Bharath Ravi <rbha...@google.com>

Hi Lee,

Martin asked for you to re-review this patch before he applies it, since
there was a small change from v3 after you acked it. The change is that
we started to protect the list_empty() verification with the spin lock
on session destruction.

For that reason, I dropped your reviewed-by. Can you please take
another look so we can have this merged?

Thanks,

-- >8 --

Connection failure processing depends on a daemon being present to (at
least) stop the connection and start recovery. This is a problem on a
multipath scenario, where if the daemon failed for whatever reason, the
SCSI path is never marked as down, multipath won't perform the
failover and IO to the device will be forever waiting for that
connection to come back.

This patch performs the connection failure entirely inside the kernel.
This way, the failover can happen and pending IO can continue even if
the daemon is dead. Once the daemon comes alive again, it can execute
recovery procedures if applicable.

Changes since v3:
- Protect list_empty with connlock on session destroy

Changes since v2:
- Don't hold rx_mutex for too long at once

Changes since v1:
- Remove module parameter.
- Always do kernel-side stop work.
- Block recovery timeout handler if system is dying.
- send a CONN_TERM stop if the system is dying.

Cc: Mike Christie <mchr...@redhat.com>
Cc: Lee Duncan <LDu...@suse.com>
Cc: Bart Van Assche <bvana...@acm.org>

Co-developed-by: Dave Clausen <dcla...@google.com>
Signed-off-by: Dave Clausen <dcla...@google.com>
Co-developed-by: Nick Black <n...@google.com>
Signed-off-by: Nick Black <n...@google.com>
Co-developed-by: Vaibhav Nagarnaik <vnaga...@google.com>
Signed-off-by: Vaibhav Nagarnaik <vnaga...@google.com>
Co-developed-by: Anatol Pomazau <ana...@google.com>
Signed-off-by: Anatol Pomazau <ana...@google.com>
Co-developed-by: Tahsin Erdogan <tah...@google.com>
Signed-off-by: Tahsin Erdogan <tah...@google.com>
Co-developed-by: Frank Mayhar <fma...@google.com>
Signed-off-by: Frank Mayhar <fma...@google.com>
Co-developed-by: Junho Ryu <ja...@google.com>
Signed-off-by: Junho Ryu <ja...@google.com>
Co-developed-by: Khazhismel Kumykov <kha...@google.com>
Signed-off-by: Khazhismel Kumykov <kha...@google.com>

Reviewed-by: Reviewed-by: Khazhismel Kumykov <kha...@google.com>

2.25.0.rc2

Lee Duncan

unread,

Jan 25, 2020, 12:22:08 PM1/25/20

to open-iscsi, Lee Duncan, martin....@oracle.com, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Michael Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov, Gabriel Krisman Bertazi

On Jan 24, 2020, at 10:19 PM, Gabriel Krisman Bertazi <kri...@collabora.com> wrote:

From: Bharath Ravi <rbha...@google.com>

Hi Lee,

Martin asked for you to re-review this patch before he applies it, since
there was a small change from v3 after you acked it. The change is that
we started to protect the list_empty() verification with the spin lock
on session destruction.

For that reason, I dropped your reviewed-by. Can you please take
another look so we can have this merged?

Thanks,

I’m sorry if it didn’t get through, but I sent a Reviewed-by update and the end of last week.

I looked over the updates, and I said that they look good to me, and I said please re-add my:

Reviewed-by: Lee Duncan <ldu...@suse.com>

--
You received this message because you are subscribed to a topic in the Google Groups "open-iscsi" group.
To unsubscribe from this topic, visit https://groups.google.com/d/topic/open-iscsi/SNd7Di-ZRao/unsubscribe.
To unsubscribe from this group and all its topics, send an email to open-iscsi+...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/open-iscsi/20200125061925.191601-1-krisman%40collabora.com.

Gabriel Krisman Bertazi

unread,

Jan 25, 2020, 12:46:29 PM1/25/20

to Lee Duncan, open-iscsi, Lee Duncan, martin....@oracle.com, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Michael Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov

Lee Duncan <leeman...@gmail.com> writes:

> I’m sorry if it didn’t get through, but I sent a Reviewed-by update and the end of last week.
>
> I looked over the updates, and I said that they look good to me, and I said please re-add my:
>

> Reviewed-by: Lee Duncan <ldu...@suse.com <mailto:ldu...@suse.com>>

Hey,

Thank you very much for the quick response! I checked here again and I didn't
get the previous email, but I see it made into the ML archive, so my apologies,
it must be something bad on my (or my employer's) setup.

Thanks,

--
Gabriel Krisman Bertazi

Martin K. Petersen

unread,

Jan 27, 2020, 10:20:29 PM1/27/20

to Gabriel Krisman Bertazi, Lee Duncan, open-iscsi, Lee Duncan, martin....@oracle.com, linux...@vger.kernel.org, Bharath Ravi, ker...@collabora.com, Michael Christie, Bart Van Assche, Dave Clausen, Nick Black, Vaibhav Nagarnaik, Anatol Pomazau, Tahsin Erdogan, Frank Mayhar, Junho Ryu, Khazhismel Kumykov

Gabriel,

> Thank you very much for the quick response! I checked here again and
> I didn't get the previous email, but I see it made into the ML
> archive, so my apologies, it must be something bad on my (or my
> employer's) setup.

I didn't get it. Patchwork didn't either.

In any case: Applied to 5.7/scsi-queue. Thanks!

Reply all

Reply to author

Forward