1
0
Fork 0

Merging upstream version 4.2+20230313.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-14 06:05:07 +01:00
parent 0471dd9ac9
commit b8d8e3b147
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
6 changed files with 437 additions and 176 deletions

565
Create.c
View file

@ -26,6 +26,10 @@
#include "md_u.h"
#include "md_p.h"
#include <ctype.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/signalfd.h>
#include <sys/wait.h>
static int round_size_and_verify(unsigned long long *size, int chunk)
{
@ -91,6 +95,382 @@ int default_layout(struct supertype *st, int level, int verbose)
return layout;
}
/*
 * Fork a child process that zeroes the data area of one member device.
 *
 * @fd: open descriptor of the device; the child calls fallocate() on it
 * @s:  array shape; s->size is the per-device size in KiB, 0 meaning
 *      "use the whole device"
 * @st: supertype supplying the default data offset (in sectors)
 * @dv: device entry supplying the name and, unless it is
 *      INVALID_SECTORS, a per-device data offset (in sectors)
 *
 * Returns the child's pid in the parent. Returns -1 if the device size
 * cannot be determined, if the device has no room after the data offset,
 * or if fork() fails. The child never returns: it exits with 0 when all
 * requests succeeded and with 1 when a zeroing request failed.
 */
static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st,
			       struct mddev_dev *dv)
{
	const unsigned long long req_size = 1 << 30;
	unsigned long long offset_bytes, size_bytes, sz;
	sigset_t sigset;
	int ret = 0;
	pid_t pid;

	if (dv->data_offset != INVALID_SECTORS)
		offset_bytes = SEC_TO_BYTES(dv->data_offset);
	else
		offset_bytes = SEC_TO_BYTES(st->data_offset);

	size_bytes = KIB_TO_BYTES(s->size);

	/*
	 * If size_bytes is zero, this is a zoned raid array where
	 * each disk is of a different size and uses its full
	 * disk. Thus zero the entire disk.
	 */
	if (!size_bytes) {
		if (!get_dev_size(fd, dv->devname, &size_bytes))
			return -1;

		/*
		 * Zeroing starts at offset_bytes, so only the space after
		 * the data offset may be requested; otherwise the range
		 * would extend past the end of the device.
		 */
		if (size_bytes <= offset_bytes) {
			pr_err("%s is too small to hold any data\n",
			       dv->devname);
			return -1;
		}
		size_bytes -= offset_bytes;
	}

	pr_info("zeroing data from %llu to %llu on: %s\n",
		offset_bytes, size_bytes, dv->devname);

	/*
	 * pr_info() goes through printf(). Flush before forking so that
	 * buffered output is not inherited by the child and written a
	 * second time when the child calls exit().
	 */
	fflush(NULL);

	pid = fork();
	if (pid < 0) {
		pr_err("Could not fork to zero disks: %s\n", strerror(errno));
		return pid;
	}
	if (pid != 0)
		return pid;

	/* Child: allow ctrl-c to terminate the zeroing between requests. */
	sigemptyset(&sigset);
	sigaddset(&sigset, SIGINT);
	sigprocmask(SIG_UNBLOCK, &sigset, NULL);

	while (size_bytes) {
		/*
		 * Split requests to the kernel into 1GB chunks seeing the
		 * fallocate() call is not interruptible and blocking a
		 * ctrl-c for several minutes is not desirable.
		 *
		 * 1GB is chosen as a compromise: the user may still have
		 * to wait several seconds if they ctrl-c on devices that
		 * zero slowly, but will reduce the number of requests
		 * required and thus the overhead on devices that perform
		 * better.
		 */
		sz = size_bytes;
		if (sz >= req_size)
			sz = req_size;

		if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
			      offset_bytes, sz)) {
			pr_err("zeroing %s failed: %s\n", dv->devname,
			       strerror(errno));
			ret = 1;
			break;
		}

		offset_bytes += sz;
		size_bytes -= sz;
	}

	exit(ret);
}
/*
 * Wait for every zeroing child recorded in zero_pids[] to terminate.
 *
 * @zero_pids: array of child pids; entries that are 0 (or negative, i.e.
 *             a failed fork) are ignored. Every entry that has been
 *             reaped is reset to 0.
 * @count:     number of entries in zero_pids
 *
 * SIGINT and SIGCHLD are blocked and consumed through a signalfd so that
 * an interrupt is noted while the children are still waited for. The
 * signals are left blocked on return.
 *
 * Returns 0 if there was nothing to wait for or all children exited with
 * status 0. Returns 1 if the signalfd could not be created or read, the
 * wait was interrupted by SIGINT, or any child failed.
 */
static int wait_for_zero_forks(int *zero_pids, int count)
{
	int wstatus, ret = 0, i, sfd, wait_count = 0;
	struct signalfd_siginfo fdsi;
	bool interrupted = false;
	sigset_t sigset;
	ssize_t s;
	pid_t pid;

	for (i = 0; i < count; i++) {
		if (zero_pids[i] > 0)
			wait_count++;
		else
			zero_pids[i] = 0; /* not a child: never wait on it */
	}
	if (!wait_count)
		return 0;

	sigemptyset(&sigset);
	sigaddset(&sigset, SIGINT);
	sigaddset(&sigset, SIGCHLD);
	sigprocmask(SIG_BLOCK, &sigset, NULL);

	sfd = signalfd(-1, &sigset, 0);
	if (sfd < 0) {
		pr_err("Unable to create signalfd: %s\n", strerror(errno));
		return 1;
	}

	while (1) {
		/*
		 * Reap by polling each child instead of counting SIGCHLD
		 * deliveries: SIGCHLD is not queued, so several exits may
		 * produce a single signal, and a child that exited before
		 * the signal was blocked produces none at all.
		 */
		for (i = 0; i < count; i++) {
			if (!zero_pids[i])
				continue;

			pid = waitpid(zero_pids[i], &wstatus, WNOHANG);
			if (pid == 0)
				continue; /* still running */

			if (pid < 0 || !WIFEXITED(wstatus) ||
			    WEXITSTATUS(wstatus))
				ret = 1;
			zero_pids[i] = 0;
			wait_count--;
		}
		if (!wait_count)
			break;

		/* Sleep until SIGINT or the next child state change. */
		s = read(sfd, &fdsi, sizeof(fdsi));
		if (s < 0 && errno == EINTR)
			continue;
		if (s != sizeof(fdsi)) {
			pr_err("Invalid signalfd read: %s\n", strerror(errno));
			close(sfd);
			return 1;
		}

		if (fdsi.ssi_signo == SIGINT) {
			printf("\n");
			pr_info("Interrupting zeroing processes, please wait...\n");
			interrupted = true;
		}
	}

	close(sfd);

	if (interrupted) {
		pr_err("zeroing interrupted!\n");
		return 1;
	}

	if (ret)
		pr_err("zeroing failed!\n");
	else
		pr_info("zeroing finished\n");

	return ret;
}
/*
 * Prepare one member device and add it to the metadata held in @st.
 *
 * @mdfd:           array descriptor, used to issue STOP_ARRAY on failure
 * @s:              array shape (raiddisks, write_zeroes)
 * @c:              context (verbosity)
 * @st:             supertype whose add_to_super()/getinfo_super() are used
 * @dv:             device being added
 * @info:           per-device info; disk.raid_disk is set by the caller,
 *                  disk.state/major/minor are filled in here
 * @have_container: non-zero when no device is opened (fd is -1)
 * @major_num:      bitmap major; BITMAP_MAJOR_CLUSTERED forbids
 *                  --write-mostly
 * @zero_pid:       set to the zeroing child's pid when one is started and
 *                  reset to 0 if starting it fails; otherwise not modified
 *
 * Returns 0 on success, 1 on failure.
 */
static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
		struct supertype *st, struct mddev_dev *dv,
		struct mdinfo *info, int have_container, int major_num,
		int *zero_pid)
{
	dev_t rdev = 0;
	int fd;

	if (dv->disposition == 'j') {
		info->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
		info->disk.state = (1<<MD_DISK_JOURNAL);
	} else if (info->disk.raid_disk < s->raiddisks) {
		info->disk.state = (1<<MD_DISK_ACTIVE) |
			(1<<MD_DISK_SYNC);
	} else {
		info->disk.state = 0;
	}

	if (dv->writemostly == FlagSet) {
		if (major_num == BITMAP_MAJOR_CLUSTERED) {
			pr_err("Can not set %s --write-mostly with a clustered bitmap\n",dv->devname);
			return 1;
		} else {
			info->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
		}
	}

	if (dv->failfast == FlagSet)
		info->disk.state |= (1<<MD_DISK_FAILFAST);

	if (have_container) {
		fd = -1;
	} else {
		if (st->ss->external && st->container_devnm[0])
			fd = open(dv->devname, O_RDWR);
		else
			fd = open(dv->devname, O_RDWR|O_EXCL);

		if (fd < 0) {
			pr_err("failed to open %s after earlier success - aborting\n",
			       dv->devname);
			return 1;
		}
		if (!fstat_is_blkdev(fd, dv->devname, &rdev)) {
			/* Nothing else holds this descriptor yet. */
			close(fd);
			return 1;
		}
		info->disk.major = major(rdev);
		info->disk.minor = minor(rdev);
	}
	if (fd >= 0)
		remove_partitions(fd);

	/*
	 * NOTE(review): fd is handed to add_to_super() and is not closed
	 * here afterwards - presumably the metadata handler takes ownership
	 * of it; confirm against the handlers.
	 */
	if (st->ss->add_to_super(st, &info->disk, fd, dv->devname,
				 dv->data_offset)) {
		ioctl(mdfd, STOP_ARRAY, NULL);
		return 1;
	}
	st->ss->getinfo_super(st, info, NULL);

	if (fd >= 0 && s->write_zeroes) {
		pid_t pid = write_zeroes_fork(fd, s, st, dv);

		if (pid <= 0) {
			/*
			 * Do not leave a negative value behind: the caller
			 * treats every non-zero entry as a child to wait for.
			 */
			*zero_pid = 0;
			ioctl(mdfd, STOP_ARRAY, NULL);
			return 1;
		}
		*zero_pid = pid;
	}

	if (have_container && c->verbose > 0)
		pr_err("Using %s for device %d\n",
		       map_dev(info->disk.major, info->disk.minor, 0),
		       info->disk.number);

	if (!have_container) {
		/* getinfo_super might have lost these ... */
		info->disk.major = major(rdev);
		info->disk.minor = minor(rdev);
	}

	return 0;
}
/*
 * Write the initial metadata and refresh the map entries afterwards.
 *
 * @mdfd:        descriptor of the array being created
 * @s:           array shape (level)
 * @st:          supertype whose metadata is written and then freed
 * @map:         map file entries to update
 * @info:        array info holding the uuid recorded before the disks
 *               were added
 * @chosen_name: name to store in the member array's map entry
 *
 * Returns 0 on success, 1 if write_init_super() or set_bitmap() fails.
 * The superblock is released with free_super() on every path.
 */
static int update_metadata(int mdfd, struct shape *s, struct supertype *st,
			   struct map_ent **map, struct mdinfo *info,
			   char *chosen_name)
{
	struct mdinfo info_new;
	struct map_ent *me = NULL;

	/* check to see if the uuid has changed due to these
	 * metadata changes, and if so update the member array
	 * and container uuid.  Note ->write_init_super clears
	 * the subarray cursor such that ->getinfo_super once
	 * again returns container info.
	 */
	st->ss->getinfo_super(st, &info_new, NULL);
	/*
	 * Only a member array of external metadata has a parent container
	 * to look up, so the array being created must NOT itself be a
	 * container.
	 */
	if (st->ss->external && !is_container(s->level) &&
	    !same_uuid(info_new.uuid, info->uuid, 0)) {
		map_update(map, fd2devnm(mdfd),
			   info_new.text_version,
			   info_new.uuid, chosen_name);
		me = map_by_devnm(map, st->container_devnm);
	}

	if (st->ss->write_init_super(st)) {
		st->ss->free_super(st);
		return 1;
	}

	/*
	 * Before activating the array, perform extra steps
	 * required to configure the internal write-intent
	 * bitmap.
	 */
	if (info_new.consistency_policy == CONSISTENCY_POLICY_BITMAP &&
	    st->ss->set_bitmap && st->ss->set_bitmap(st, info)) {
		st->ss->free_super(st);
		return 1;
	}

	/* update parent container uuid */
	if (me) {
		/* Copy the path: map_update() may replace the entry. */
		char *path = xstrdup(me->path);

		st->ss->getinfo_super(st, &info_new, NULL);
		map_update(map, st->container_devnm, info_new.text_version,
			   info_new.uuid, path);
		free(path);
	}

	flush_metadata_updates(st);
	st->ss->free_super(st);

	return 0;
}
static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
struct context *c, struct supertype *st,
struct map_ent **map, struct mddev_dev *devlist,
int total_slots, int have_container, int insert_point,
int major_num, char *chosen_name)
{
struct mddev_dev *moved_disk = NULL;
int pass, raid_disk_num, dnum;
int zero_pids[total_slots];
struct mddev_dev *dv;
struct mdinfo *infos;
sigset_t sigset, orig_sigset;
int ret = 0;
/*
* Block SIGINT so the main thread will always wait for the
* zeroing processes when being interrupted. Otherwise the
* zeroing processes will finish their work in the background
* keeping the disk busy.
*/
sigemptyset(&sigset);
sigaddset(&sigset, SIGINT);
sigprocmask(SIG_BLOCK, &sigset, &orig_sigset);
memset(zero_pids, 0, sizeof(zero_pids));
infos = xmalloc(sizeof(*infos) * total_slots);
enable_fds(total_slots);
for (pass = 1; pass <= 2; pass++) {
for (dnum = 0, raid_disk_num = 0, dv = devlist; dv;
dv = (dv->next) ? (dv->next) : moved_disk, dnum++) {
if (dnum >= total_slots)
abort();
if (dnum == insert_point) {
raid_disk_num += 1;
moved_disk = dv;
continue;
}
if (strcasecmp(dv->devname, "missing") == 0) {
raid_disk_num += 1;
continue;
}
if (have_container)
moved_disk = NULL;
if (have_container && dnum < total_slots - 1)
/* repeatedly use the container */
moved_disk = dv;
switch(pass) {
case 1:
infos[dnum] = *info;
infos[dnum].disk.number = dnum;
infos[dnum].disk.raid_disk = raid_disk_num++;
if (dv->disposition == 'j')
raid_disk_num--;
ret = add_disk_to_super(mdfd, s, c, st, dv,
&infos[dnum], have_container,
major_num, &zero_pids[dnum]);
if (ret)
goto out;
break;
case 2:
infos[dnum].errors = 0;
ret = add_disk(mdfd, st, info, &infos[dnum]);
if (ret) {
pr_err("ADD_NEW_DISK for %s failed: %s\n",
dv->devname, strerror(errno));
if (errno == EINVAL &&
info->array.level == 0) {
pr_err("Possibly your kernel doesn't support RAID0 layouts.\n");
pr_err("Either upgrade, or use --layout=dangerous\n");
}
goto out;
}
break;
}
if (!have_container &&
dv == moved_disk && dnum != insert_point) break;
}
if (pass == 1) {
ret = wait_for_zero_forks(zero_pids, total_slots);
if (ret)
goto out;
ret = update_metadata(mdfd, s, st, map, info,
chosen_name);
if (ret)
goto out;
}
}
out:
if (ret)
wait_for_zero_forks(zero_pids, total_slots);
free(infos);
sigprocmask(SIG_SETMASK, &orig_sigset, NULL);
return ret;
}
int Create(struct supertype *st, char *mddev,
char *name, int *uuid,
int subdevs, struct mddev_dev *devlist,
@ -117,7 +497,7 @@ int Create(struct supertype *st, char *mddev,
unsigned long long minsize = 0, maxsize = 0;
char *mindisc = NULL;
char *maxdisc = NULL;
int dnum, raid_disk_num;
int dnum;
struct mddev_dev *dv;
dev_t rdev;
int fail = 0, warn = 0;
@ -126,18 +506,16 @@ int Create(struct supertype *st, char *mddev,
int missing_disks = 0;
int insert_point = subdevs * 2; /* where to insert a missing drive */
int total_slots;
int pass;
int rv;
int bitmap_fd;
int have_container = 0;
int container_fd = -1;
int need_mdmon = 0;
unsigned long long bitmapsize;
struct mdinfo info, *infos;
struct mdinfo info;
int did_default = 0;
int do_default_layout = 0;
int do_default_chunk = 0;
unsigned long safe_mode_delay = 0;
char chosen_name[1024];
struct map_ent *map = NULL;
unsigned long long newsize;
@ -778,11 +1156,12 @@ int Create(struct supertype *st, char *mddev,
mdi = sysfs_read(-1, devnm, GET_VERSION);
pr_err("Creating array inside %s container %s\n",
pr_info("Creating array inside %s container %s\n",
mdi?mdi->text_version:"managed", devnm);
sysfs_free(mdi);
} else
pr_err("Defaulting to version %s metadata\n", info.text_version);
pr_info("Defaulting to version %s metadata\n",
info.text_version);
}
map_update(&map, fd2devnm(mdfd), info.text_version,
@ -870,175 +1249,11 @@ int Create(struct supertype *st, char *mddev,
}
}
infos = xmalloc(sizeof(*infos) * total_slots);
enable_fds(total_slots);
for (pass = 1; pass <= 2; pass++) {
struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
if (add_disks(mdfd, &info, s, c, st, &map, devlist, total_slots,
have_container, insert_point, major_num, chosen_name))
goto abort_locked;
for (dnum = 0, raid_disk_num = 0, dv = devlist; dv;
dv = (dv->next) ? (dv->next) : moved_disk, dnum++) {
int fd;
struct mdinfo *inf = &infos[dnum];
if (dnum >= total_slots)
abort();
if (dnum == insert_point) {
raid_disk_num += 1;
moved_disk = dv;
continue;
}
if (strcasecmp(dv->devname, "missing") == 0) {
raid_disk_num += 1;
continue;
}
if (have_container)
moved_disk = NULL;
if (have_container && dnum < info.array.raid_disks - 1)
/* repeatedly use the container */
moved_disk = dv;
switch(pass) {
case 1:
*inf = info;
inf->disk.number = dnum;
inf->disk.raid_disk = raid_disk_num++;
if (dv->disposition == 'j') {
inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
inf->disk.state = (1<<MD_DISK_JOURNAL);
raid_disk_num--;
} else if (inf->disk.raid_disk < s->raiddisks)
inf->disk.state = (1<<MD_DISK_ACTIVE) |
(1<<MD_DISK_SYNC);
else
inf->disk.state = 0;
if (dv->writemostly == FlagSet) {
if (major_num == BITMAP_MAJOR_CLUSTERED) {
pr_err("Can not set %s --write-mostly with a clustered bitmap\n",dv->devname);
goto abort_locked;
} else
inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
}
if (dv->failfast == FlagSet)
inf->disk.state |= (1<<MD_DISK_FAILFAST);
if (have_container)
fd = -1;
else {
if (st->ss->external &&
st->container_devnm[0])
fd = open(dv->devname, O_RDWR);
else
fd = open(dv->devname, O_RDWR|O_EXCL);
if (fd < 0) {
pr_err("failed to open %s after earlier success - aborting\n",
dv->devname);
goto abort_locked;
}
if (!fstat_is_blkdev(fd, dv->devname, &rdev))
return 1;
inf->disk.major = major(rdev);
inf->disk.minor = minor(rdev);
}
if (fd >= 0)
remove_partitions(fd);
if (st->ss->add_to_super(st, &inf->disk,
fd, dv->devname,
dv->data_offset)) {
ioctl(mdfd, STOP_ARRAY, NULL);
goto abort_locked;
}
st->ss->getinfo_super(st, inf, NULL);
safe_mode_delay = inf->safe_mode_delay;
if (have_container && c->verbose > 0)
pr_err("Using %s for device %d\n",
map_dev(inf->disk.major,
inf->disk.minor,
0), dnum);
if (!have_container) {
/* getinfo_super might have lost these ... */
inf->disk.major = major(rdev);
inf->disk.minor = minor(rdev);
}
break;
case 2:
inf->errors = 0;
rv = add_disk(mdfd, st, &info, inf);
if (rv) {
pr_err("ADD_NEW_DISK for %s failed: %s\n",
dv->devname, strerror(errno));
if (errno == EINVAL &&
info.array.level == 0) {
pr_err("Possibly your kernel doesn't support RAID0 layouts.\n");
pr_err("Either upgrade, or use --layout=dangerous\n");
}
goto abort_locked;
}
break;
}
if (!have_container &&
dv == moved_disk && dnum != insert_point) break;
}
if (pass == 1) {
struct mdinfo info_new;
struct map_ent *me = NULL;
/* check to see if the uuid has changed due to these
* metadata changes, and if so update the member array
* and container uuid. Note ->write_init_super clears
* the subarray cursor such that ->getinfo_super once
* again returns container info.
*/
st->ss->getinfo_super(st, &info_new, NULL);
if (st->ss->external && !is_container(s->level) &&
!same_uuid(info_new.uuid, info.uuid, 0)) {
map_update(&map, fd2devnm(mdfd),
info_new.text_version,
info_new.uuid, chosen_name);
me = map_by_devnm(&map, st->container_devnm);
}
if (st->ss->write_init_super(st)) {
st->ss->free_super(st);
goto abort_locked;
}
/*
* Before activating the array, perform extra steps
* required to configure the internal write-intent
* bitmap.
*/
if (info_new.consistency_policy ==
CONSISTENCY_POLICY_BITMAP &&
st->ss->set_bitmap &&
st->ss->set_bitmap(st, &info)) {
st->ss->free_super(st);
goto abort_locked;
}
/* update parent container uuid */
if (me) {
char *path = xstrdup(me->path);
st->ss->getinfo_super(st, &info_new, NULL);
map_update(&map, st->container_devnm,
info_new.text_version,
info_new.uuid, path);
free(path);
}
flush_metadata_updates(st);
st->ss->free_super(st);
}
}
map_unlock(&map);
free(infos);
if (is_container(s->level)) {
/* No need to start. But we should signal udev to
@ -1065,7 +1280,7 @@ int Create(struct supertype *st, char *mddev,
"readonly");
break;
}
sysfs_set_safemode(&info, safe_mode_delay);
sysfs_set_safemode(&info, info.safe_mode_delay);
if (err) {
pr_err("failed to activate array.\n");
ioctl(mdfd, STOP_ARRAY, NULL);
@ -1103,7 +1318,7 @@ int Create(struct supertype *st, char *mddev,
ioctl(mdfd, RESTART_ARRAY_RW, NULL);
}
if (c->verbose >= 0)
pr_err("array %s started.\n", mddev);
pr_info("array %s started.\n", mddev);
if (st->ss->external && st->container_devnm[0]) {
if (need_mdmon)
start_mdmon(st->container_devnm);

View file

@ -138,6 +138,7 @@ struct option long_options[] = {
{"size", 1, 0, 'z'},
{"auto", 1, 0, Auto}, /* also for --assemble */
{"assume-clean",0,0, AssumeClean },
{"write-zeroes",0,0, WriteZeroes },
{"metadata", 1, 0, 'e'}, /* superblock format */
{"bitmap", 1, 0, Bitmap},
{"bitmap-chunk", 1, 0, BitmapChunk},
@ -390,6 +391,7 @@ char Help_create[] =
" --write-journal= : Specify journal device for RAID-4/5/6 array\n"
" --consistency-policy= : Specify the policy that determines how the array\n"
" -k : maintains consistency in case of unexpected shutdown.\n"
" --write-zeroes : Write zeroes to the disks before creating. This will bypass initial sync.\n"
"\n"
;

View file

@ -837,6 +837,22 @@ array is resynced at creation. From Linux version 3.0,
.B \-\-assume\-clean
can be used with that command to avoid the automatic resync.
.TP
.BR \-\-write\-zeroes
When creating an array, send write zeroes requests to all the block
devices. This should zero the data area on all disks such that the
initial sync is not necessary and, if successful, will behave
as if
.B \-\-assume\-clean
was specified.
.IP
This is intended for use with devices that have hardware offload for
zeroing, but despite this zeroing can still take several minutes for
large disks. Thus a message is printed before and after zeroing and
each disk is zeroed in parallel with the others.
.IP
This is only meaningful with --create.
.TP
.BR \-\-backup\-file=
This is needed when
@ -1370,7 +1386,7 @@ and
.B layout\-alternate
options are for RAID0 arrays with non-uniform devices size that were in
use before Linux 5.4. If the array was being used with Linux 3.13 or
earlier, then to assemble the array on a new kernel,
earlier, then to assemble the array on a new kernel,
.B \-\-update=layout\-original
must be given. If the array was created and used with a kernel from Linux 3.14 to
Linux 5.3, then

View file

@ -590,6 +590,10 @@ int main(int argc, char *argv[])
s.assume_clean = 1;
continue;
case O(CREATE, WriteZeroes):
s.write_zeroes = 1;
continue;
case O(GROW,'n'):
case O(CREATE,'n'):
case O(BUILD,'n'): /* number of raid disks */
@ -1251,6 +1255,11 @@ int main(int argc, char *argv[])
}
}
if (s.write_zeroes && !s.assume_clean) {
pr_info("Disk zeroing requested, setting --assume-clean to skip resync\n");
s.assume_clean = 1;
}
if (!mode && devs_found) {
mode = MISC;
devmode = 'Q';

View file

@ -275,6 +275,9 @@ static inline void __put_unaligned32(__u32 val, void *p)
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
/*
 * Convert a count of KiB to bytes (shift left by 10).
 * The result has the (promoted) type of x, so pass a type wide enough
 * to hold the byte count.
 */
#define KIB_TO_BYTES(x) ((x) << 10)
/*
 * Convert a count of 512-byte sectors to bytes (shift left by 9).
 * The result has the (promoted) type of x, so pass a type wide enough
 * to hold the byte count.
 */
#define SEC_TO_BYTES(x) ((x) << 9)
extern const char Name[];
struct md_bb_entry {
@ -435,6 +438,7 @@ extern char Version[], Usage[], Help[], OptionHelp[],
*/
enum special_options {
AssumeClean = 300,
WriteZeroes,
BitmapChunk,
WriteBehind,
ReAdd,
@ -640,6 +644,7 @@ struct shape {
int bitmap_chunk;
char *bitmap_file;
int assume_clean;
bool write_zeroes;
int write_behind;
unsigned long long size;
unsigned long long data_offset;
@ -1854,6 +1859,8 @@ static inline int xasprintf(char **strp, const char *fmt, ...) {
#endif
#define cont_err(fmt ...) fprintf(stderr, " " fmt)
/*
 * Print an informational message on stdout, prefixed with Name and ": ".
 * fmt must be a string literal because it is pasted onto the prefix.
 */
#define pr_info(fmt, args...) printf("%s: "fmt, Name, ##args)
void *xmalloc(size_t len);
void *xrealloc(void *ptr, size_t len);
void *xcalloc(size_t num, size_t size);

12
tests/00raid5-zero Normal file
View file

@ -0,0 +1,12 @@
# Create a 3-device RAID5 array with --write-zeroes and verify the result.
# NOTE(review): $md0, $dev0..$dev2, $targetdir and the "check" helper are
# not defined here - presumably provided by the test harness; confirm.
if mdadm -CfR $md0 -l 5 -n3 $dev0 $dev1 $dev2 --write-zeroes ; then
# Creation succeeded: expect no resync to be running, then request a
# "check" pass through sysfs and wait for it to complete.
check nosync
echo check > /sys/block/md0/md/sync_action;
check wait
# Creation failed because zeroing is not supported: treat as a skip.
elif grep "zeroing [^ ]* failed: Operation not supported" \
$targetdir/stderr; then
echo "write-zeros not supported, skipping"
else
# Any other failure is a genuine test failure.
echo >&2 "ERROR: mdadm return failure without not supported message"
exit 1
fi