linux/fs/ocfs2/dlmglue.h
Eric Ren 439a36b8ef ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
We are in the situation that we have to avoid recursive cluster locking,
but there is no way to check if a cluster lock has been taken by a process
already.

Mostly, we can avoid recursive locking by writing code carefully.
However, we found that it's very hard to handle the routines that are
invoked directly by vfs code.  For instance:

  const struct inode_operations ocfs2_file_iops = {
      .permission     = ocfs2_permission,
      .get_acl        = ocfs2_iop_get_acl,
      .set_acl        = ocfs2_iop_set_acl,
  };

Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR):

  do_sys_open
   may_open
    inode_permission
     ocfs2_permission
      ocfs2_inode_lock() <=== first time
       generic_permission
        get_acl
         ocfs2_iop_get_acl
  	ocfs2_inode_lock() <=== recursive one

A deadlock will occur if a remote EX request comes in between the two
calls to ocfs2_inode_lock().  Briefly, the deadlock is formed as follows:

On one hand, the OCFS2_LOCK_BLOCKED flag of this lockres is set in
BAST (ocfs2_generic_handle_bast) when downconvert is started on behalf of
the remote EX lock request.  On the other hand, the recursive cluster lock
(the second one) will be blocked in __ocfs2_cluster_lock() because of
OCFS2_LOCK_BLOCKED.  But the downconvert never completes.  Why?  Because
there is no chance for the first cluster lock on this node to be
unlocked - we block ourselves in the code path.

The idea to fix this issue is mostly taken from gfs2 code.

1. introduce a new field: struct ocfs2_lock_res.l_holders, to keep track
   of the pids of the processes that have taken the cluster lock of this
   lock resource;

2. introduce a new flag for ocfs2_inode_lock_full:
   OCFS2_META_LOCK_GETBH; it means just getting back disk inode bh for
   us if we've got cluster lock.

3. export a helper: ocfs2_is_locked_by_me() is used to check if we have
   got the cluster lock in the upper code path.

The tracking logic should be used by some of the ocfs2 vfs's callbacks,
to solve the recursive locking issue caused by the fact that vfs
routines can call into each other.

The performance penalty of processing the holder list should only be
seen at a few cases where the tracking logic is used, such as get/set
acl.

You may ask: what if the first time we got a PR lock, and the second time
we want an EX lock?  Fortunately, this case never happens in the real
world, as far as I can see, including permission check,
(get|set)_(acl|attr), and the gfs2 code also does so.

[sfr@canb.auug.org.au remove some inlines]
Link: http://lkml.kernel.org/r/20170117100948.11657-2-zren@suse.com
Signed-off-by: Eric Ren <zren@suse.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-22 16:41:27 -08:00

192 lines
6.3 KiB
C

/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmglue.h
*
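* Declarations for the ocfs2 cluster-locking glue: lock value block
* layouts, lock flags and the lock/unlock entry points.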
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#ifndef DLMGLUE_H
#define DLMGLUE_H
#include "dcache.h"
/*
 * Inode metadata lock value block, layout version OCFS2_LVB_VERSION.
 *
 * Apart from the two leading single-byte members, every field is
 * declared with a big-endian (__beNN) type.  Taking each type at the
 * width its name states, the members add up to 64 bytes.
 *
 * NOTE(review): the lvb_i* members presumably carry the same-named inode
 * attributes between cluster nodes -- confirm against the pack/unpack
 * code in dlmglue.c before relying on that.
 */
#define OCFS2_LVB_VERSION 5

struct ocfs2_meta_lvb {
	__u8		lvb_version;
	__u8		lvb_reserved0;
	__be16		lvb_idynfeatures;
	__be32		lvb_iclusters;
	__be32		lvb_iuid;
	__be32		lvb_igid;
	__be64		lvb_iatime_packed;
	__be64		lvb_ictime_packed;
	__be64		lvb_imtime_packed;
	__be64		lvb_isize;
	__be16		lvb_imode;
	__be16		lvb_inlink;
	__be32		lvb_iattr;
	__be32		lvb_igeneration;
	__be32		lvb_reserved2;
};
/*
 * Quota-info lock value block, layout version OCFS2_QINFO_LVB_VERSION.
 *
 * A version byte and three reserved bytes are followed by seven
 * big-endian 32-bit fields (32 bytes in total, taking each type at the
 * width its name states).
 *
 * NOTE(review): lvb_bgrace/lvb_igrace look like block and inode grace
 * periods -- confirm against the quota code that fills this in.
 */
#define OCFS2_QINFO_LVB_VERSION 1

struct ocfs2_qinfo_lvb {
	__u8		lvb_version;
	__u8		lvb_reserved[3];
	__be32		lvb_bgrace;
	__be32		lvb_igrace;
	__be32		lvb_syncms;
	__be32		lvb_blocks;
	__be32		lvb_free_blk;
	__be32		lvb_free_entry;
};
/*
 * Orphan-scan lock value block, layout version OCFS2_ORPHAN_LVB_VERSION.
 *
 * NOTE(review): lvb_os_seqno is presumably the sequence number that
 * ocfs2_orphan_scan_lock()/ocfs2_orphan_scan_unlock() exchange through
 * their seqno arguments -- confirm in dlmglue.c.
 */
#define OCFS2_ORPHAN_LVB_VERSION 1

struct ocfs2_orphan_scan_lvb {
	__u8		lvb_version;
	__u8		lvb_reserved[3];
	__be32		lvb_os_seqno;
};
/*
 * One record of "who holds this cluster lock": a list linkage plus a
 * pointer to the struct pid of the holder.  It is passed to
 * ocfs2_inode_lock_tracker() / ocfs2_inode_unlock_tracker().
 *
 * NOTE(review): which list oh_list is linked into, and who takes and
 * drops the reference behind oh_owner_pid, is not visible from this
 * header -- see dlmglue.c.
 */
struct ocfs2_lock_holder {
	struct list_head	oh_list;
	struct pid		*oh_owner_pid;
};
/*
 * Bits accepted in the 'arg_flags' argument of ocfs2_inode_lock_full().
 * Each flag occupies its own bit, so they can be OR-ed together.
 */

/* Do not wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY	(1 << 0)

/* Tell the dlm not to queue ourselves on the other node. */
#define OCFS2_META_LOCK_NOQUEUE		(1 << 1)

/* Return -EAGAIN rather than block waiting for the downconvert thread. */
#define OCFS2_LOCK_NONBLOCK		(1 << 2)

/* Cluster lock is already held: only hand back the disk inode bh. */
#define OCFS2_META_LOCK_GETBH		(1 << 3)
/*
 * Locking subclasses of the inode cluster lock.  These are the values
 * passed as the 'subclass' argument of ocfs2_inode_lock_full_nested();
 * OI_LS_NORMAL is what the plain ocfs2_inode_lock() wrappers use.
 */
enum {
	OI_LS_NORMAL		= 0,
	OI_LS_PARENT		= 1,
	OI_LS_RENAME1		= 2,
	OI_LS_RENAME2		= 3,
	OI_LS_REFLINK_TARGET	= 4,
};
/* Per-superblock DLM init and shutdown entry points. */
int ocfs2_dlm_init(struct ocfs2_super *osb);
void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);

/*
 * Lock resource initialisers -- one per kind of object a lock resource
 * is set up for here (inode, dentry, open file, quota info, refcount
 * tree) -- followed by ocfs2_lock_res_free().
 *
 * NOTE(review): the required ordering between ocfs2_lock_res_init_once()
 * and the typed initialisers is not visible from this header; check
 * dlmglue.c before adding a new caller.
 */
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
			       enum ocfs2_lock_type type,
			       unsigned int generation,
			       struct inode *inode);
void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
				u64 parent, struct inode *inode);

struct ocfs2_file_private;
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
			      struct ocfs2_file_private *fp);

struct ocfs2_mem_dqinfo;
void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
			       struct ocfs2_mem_dqinfo *info);

void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
				  struct ocfs2_super *osb, u64 ref_blkno,
				  unsigned int generation);

void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
/* Creation and dropping of the cluster locks that belong to an inode. */
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);

/* Read/write lock on an inode; 'write' is passed to both lock and unlock. */
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);

/* Open lock on an inode, with a "try" variant. */
int ocfs2_open_lock(struct inode *inode);
int ocfs2_try_open_lock(struct inode *inode, int write);
void ocfs2_open_unlock(struct inode *inode);

/*
 * Inode cluster lock acquisition.
 *
 * ocfs2_inode_lock_full_nested() is the most general form; the
 * ocfs2_inode_lock*() macros in this header expand to it.
 *   ret_bh    - out parameter for a buffer_head pointer
 *   ex        - NOTE(review): presumably non-zero for an exclusive (EX)
 *               request and zero for a shared (PR) one -- confirm
 *   arg_flags - the 'arg_flags' flag bits defined in this header
 *   subclass  - one of the OI_LS_* locking subclasses
 */
int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt,
			   int *level);
int ocfs2_inode_lock_full_nested(struct inode *inode,
				 struct buffer_head **ret_bh,
				 int ex, int arg_flags, int subclass);
int ocfs2_inode_lock_with_page(struct inode *inode,
			       struct buffer_head **ret_bh,
			       int ex, struct page *page);
/*
 * Convenience wrappers around ocfs2_inode_lock_full_nested() that fill
 * in the arguments a caller does not want to spell out.
 */

/* Caller supplies flags; the locking subclass is OI_LS_NORMAL. */
#define ocfs2_inode_lock_full(inode, bh, ex, flags)		\
	ocfs2_inode_lock_full_nested(inode, bh, ex, flags, OI_LS_NORMAL)

/* Caller supplies the locking subclass; no flags are passed. */
#define ocfs2_inode_lock_nested(inode, bh, ex, subclass)	\
	ocfs2_inode_lock_full_nested(inode, bh, ex, 0, subclass)

/*
 * The common case: no flags and the normal subclass.  Additional flags
 * are for very specific cases only, so 99% of callers want this one.
 */
#define ocfs2_inode_lock(inode, bh, ex)				\
	ocfs2_inode_lock_full_nested(inode, bh, ex, 0, OI_LS_NORMAL)
/*
 * Lock/unlock pairs for the remaining cluster lock types.
 *
 * NOTE(review): where a function takes 'ex', it presumably selects an
 * exclusive rather than a shared request, and the unlock side should be
 * given the same value the lock side was called with -- confirm against
 * dlmglue.c.
 */

/* Counterpart of the ocfs2_inode_lock*() family. */
void ocfs2_inode_unlock(struct inode *inode, int ex);

/* Super block lock. */
int ocfs2_super_lock(struct ocfs2_super *osb, int ex);
void ocfs2_super_unlock(struct ocfs2_super *osb, int ex);

/* Orphan scan lock; a sequence number goes out on lock, back in on unlock. */
int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno);
void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno);

/* Rename lock. */
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);

/* NFS sync lock. */
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);

/* Dentry lock. */
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);

/* File lock, with an optional trylock mode. */
int ocfs2_file_lock(struct file *file, int ex, int trylock);
void ocfs2_file_unlock(struct file *file);

/* Quota info lock. */
int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);

/* Refcount tree lock. */
struct ocfs2_refcount_tree;
int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
/*
 * Lock resource teardown helpers.
 * NOTE(review): the expected calling order of these two is not visible
 * from this header -- see their definitions in dlmglue.c.
 */
void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
			       struct ocfs2_lock_res *lockres);

/* For the downconvert thread. */
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);

/* Allocation and release of the dlm debug state object. */
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);

/* Sets the locking protocol; to be called on module initialization. */
void ocfs2_set_locking_protocol(void);
/*
 * The _tracker pair exists to avoid recursive cluster locking: the
 * caller passes in a struct ocfs2_lock_holder alongside the usual inode
 * lock arguments.
 *
 * NOTE(review): 'had_lock' is presumably the value returned by the
 * matching ocfs2_inode_lock_tracker() call, telling the unlock side
 * whether this caller already held the lock -- confirm against the
 * definitions in dlmglue.c.
 */
int ocfs2_inode_lock_tracker(struct inode *inode,
			     struct buffer_head **ret_bh,
			     int ex, struct ocfs2_lock_holder *oh);
void ocfs2_inode_unlock_tracker(struct inode *inode, int ex,
				struct ocfs2_lock_holder *oh, int had_lock);
#endif /* DLMGLUE_H */