ocfs2/dlm: fix a race between purge and migration
author Xue jiufei <xuejiufei@huawei.com>
Thu, 14 Jan 2016 23:17:18 +0000 (15:17 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 15 Jan 2016 00:00:49 +0000 (16:00 -0800)
We found a race between purge and migration during code review.
Node A puts a lockres on the purge list before receiving the migrate
message from node B, which is the master.  Node A then calls
dlm_mig_lockres_handler to handle this message.

dlm_mig_lockres_handler
  dlm_lookup_lockres
  >>>>>> race window: dlm_run_purge_list may run here, send a
         deref message to the master, and wait for the response
  spin_lock(&res->spinlock);
  res->state |= DLM_LOCK_RES_MIGRATING;
  spin_unlock(&res->spinlock);
  dlm_mig_lockres_handler returns

  >>>>>> dlm_thread receives the response from the master for the
  deref message and triggers the BUG, because the lockres now has the
  state DLM_LOCK_RES_MIGRATING set.  The following message is logged:

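dlm_purge_lockres:209 ERROR: 6633EB681FA7474A9C280A4E1A836F0F: res
M0000000000000000030c0300000000 in use after deref

To close the window, dlm_mig_lockres_handler now takes dlm->spinlock
before looking up the lockres (via __dlm_lookup_lockres) and holds it
until the lockres state has been updated.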

Signed-off-by: Jiufei Xue <xuejiufei@huawei.com>
Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/ocfs2/dlm/dlmrecovery.c

index 9e4f862d20fe1749dc11f6d023793fccbed0069a..86fb53614bf4faed5955a754f033c5e0cdbf1e2f 100644 (file)
@@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
        char *buf = NULL;
        struct dlm_work_item *item = NULL;
        struct dlm_lock_resource *res = NULL;
+       unsigned int hash;
 
        if (!dlm_grab(dlm))
                return -EINVAL;
@@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
        /* lookup the lock to see if we have a secondary queue for this
         * already...  just add the locks in and this will have its owner
         * and RECOVERY flag changed when it completes. */
-       res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+       hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
+       spin_lock(&dlm->spinlock);
+       res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+                       hash);
        if (res) {
                /* this will get a ref on res */
                /* mark it as recovering/migrating and hash it */
@@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
                                     mres->lockname_len, mres->lockname);
                                ret = -EFAULT;
                                spin_unlock(&res->spinlock);
+                               spin_unlock(&dlm->spinlock);
                                dlm_lockres_put(res);
                                goto leave;
                        }
                        res->state |= DLM_LOCK_RES_MIGRATING;
                }
                spin_unlock(&res->spinlock);
+               spin_unlock(&dlm->spinlock);
        } else {
+               spin_unlock(&dlm->spinlock);
                /* need to allocate, just like if it was
                 * mastered here normally  */
                res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);