mirror of
https://github.com/torvalds/linux.git
synced 2024-11-14 16:12:02 +00:00
35a0a409fa
The commit db5e653d7c
("md: delay choosing sync action to md_start_sync()") delays the start of the sync action. In a clustered environment, this will cause another node to first activate the spare disk and skip recovery. As a result, no nodes will perform recovery when a disk is added or re-added. Before db5e653d7c
: ``` node1 node2 ---------------------------------------------------------------- md_check_recovery + md_update_sb | sendmsg: METADATA_UPDATED + md_choose_sync_action process_metadata_update | remove_and_add_spares //node1 has not finished adding + call mddev->sync_work //the spare disk:do nothing md_start_sync starts md_do_sync md_do_sync + grabbed resync_lockres:DLM_LOCK_EX + do syncing job md_check_recovery sendmsg: METADATA_UPDATED process_metadata_update //activate spare disk ... ... md_do_sync waiting to grab resync_lockres:EX ``` After db5e653d7c
: (note: if 'cmd:idle' sets MD_RECOVERY_INTR after md_check_recovery starts md_start_sync, setting the INTR action will exacerbate the delay in node1 calling the md_do_sync function.) ``` node1 node2 ---------------------------------------------------------------- md_check_recovery + md_update_sb | sendmsg: METADATA_UPDATED + calls mddev->sync_work process_metadata_update //node1 has not finished adding //the spare disk:do nothing md_start_sync + md_choose_sync_action | remove_and_add_spares + calls md_do_sync md_check_recovery md_update_sb sendmsg: METADATA_UPDATED process_metadata_update //activate spare disk ... ... ... ... md_do_sync + grabbed resync_lockres:EX + raid1_sync_request skip sync under conf->fullsync:0 md_do_sync 1. waiting to grab resync_lockres:EX 2. when node1 could grab EX lock, node1 will skip resync under recovery_offset:MaxSector ``` How to trigger: ```(commands @node1) # to easily watch the recovery status echo 2000 > /proc/sys/dev/raid/speed_limit_max ssh root@node2 "echo 2000 > /proc/sys/dev/raid/speed_limit_max" mdadm -CR /dev/md0 -l1 -b clustered -n 2 /dev/sda /dev/sdb --assume-clean ssh root@node2 mdadm -A /dev/md0 /dev/sda /dev/sdb mdadm --manage /dev/md0 --fail /dev/sda --remove /dev/sda mdadm --manage /dev/md0 --add /dev/sdc === "cat /proc/mdstat" on both nodes, there is no recovery action. === ``` How to fix: because the md layer logic cannot easily be restored to speed up the sync job on the local node, we add a new cluster message that makes the other node hold off on activating the disk. Signed-off-by: Heming Zhao <heming.zhao@suse.com> Reviewed-by: Su Yue <glass.su@suse.com> Acked-by: Yu Kuai <yukuai3@huawei.com> Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20240709104120.22243-2-heming.zhao@suse.com
39 lines
1.5 KiB
C
39 lines
1.5 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
|
|
#ifndef _MD_CLUSTER_H
|
|
#define _MD_CLUSTER_H
|
|
|
|
#include "md.h"
|
|
|
|
struct mddev;
|
|
struct md_rdev;
|
|
|
|
struct md_cluster_operations {
|
|
int (*join)(struct mddev *mddev, int nodes);
|
|
int (*leave)(struct mddev *mddev);
|
|
int (*slot_number)(struct mddev *mddev);
|
|
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
|
|
int (*resync_start_notify)(struct mddev *mddev);
|
|
int (*resync_status_get)(struct mddev *mddev);
|
|
void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
|
|
int (*metadata_update_start)(struct mddev *mddev);
|
|
int (*metadata_update_finish)(struct mddev *mddev);
|
|
void (*metadata_update_cancel)(struct mddev *mddev);
|
|
int (*resync_start)(struct mddev *mddev);
|
|
int (*resync_finish)(struct mddev *mddev);
|
|
int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
|
|
int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev);
|
|
void (*add_new_disk_cancel)(struct mddev *mddev);
|
|
int (*new_disk_ack)(struct mddev *mddev, bool ack);
|
|
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
|
|
void (*load_bitmaps)(struct mddev *mddev, int total_slots);
|
|
int (*gather_bitmaps)(struct md_rdev *rdev);
|
|
int (*resize_bitmaps)(struct mddev *mddev, sector_t newsize, sector_t oldsize);
|
|
int (*lock_all_bitmaps)(struct mddev *mddev);
|
|
void (*unlock_all_bitmaps)(struct mddev *mddev);
|
|
void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
|
|
};
|
|
|
|
#endif /* _MD_CLUSTER_H */
|