/*
 * mdadm - manage Linux "md" devices aka RAID arrays.
 *
 * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
 *
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *    Author: Neil Brown
 *    Email: <neilb@suse.de>
 */

#include "mdadm.h"
#include "md_u.h"
#include "md_p.h"
#include "udev.h"
#include "xmalloc.h"

#include <ctype.h>

int Manage_ro(char *devname, int fd, int readonly)
{
	/* switch to readonly or rw
	 *
	 * requires >= 0.90.0
	 * first check that array is runing
	 * use RESTART_ARRAY_RW or STOP_ARRAY_RO
	 *
	 */
	struct mdinfo *mdi;
	int rv = 0;

	/* If this is an externally-managed array, we need to modify the
	 * metadata_version so that mdmon doesn't undo our change.
	 */
	mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
	if (mdi &&
	    mdi->array.major_version == -1 &&
	    is_subarray(mdi->text_version)) {
		char vers[64];
		strcpy(vers, "external:");
		strcat(vers, mdi->text_version);
		if (readonly > 0) {
			int rv;
			/* We set readonly ourselves. */
			vers[9] = '-';
			sysfs_set_str(mdi, NULL, "metadata_version", vers);

			close_fd(&fd);
			rv = sysfs_set_str(mdi, NULL, "array_state", "readonly");

			if (rv < 0) {
				pr_err("failed to set readonly for %s: %s\n",
					devname, strerror(errno));

				vers[9] = mdi->text_version[0];
				sysfs_set_str(mdi, NULL, "metadata_version", vers);
				rv = 1;
				goto out;
			}
		} else {
			char *cp;
			/* We cannot set read/write - must signal mdmon */
			vers[9] = '/';
			sysfs_set_str(mdi, NULL, "metadata_version", vers);

			cp = strchr(vers+10, '/');
			if (cp)
				*cp = 0;
			ping_monitor(vers+10);
			if (mdi->array.level <= 0)
				sysfs_set_str(mdi, NULL, "array_state", "active");
		}
		goto out;
	}

	if (!md_array_active(fd)) {
		pr_err("%s does not appear to be active.\n", devname);
		rv = 1;
		goto out;
	}

	if (readonly > 0) {
		if (ioctl(fd, STOP_ARRAY_RO, NULL)) {
			pr_err("failed to set readonly for %s: %s\n",
				devname, strerror(errno));
			rv = 1;
			goto out;
		}
	} else if (readonly < 0) {
		if (ioctl(fd, RESTART_ARRAY_RW, NULL)) {
			pr_err("failed to set writable for %s: %s\n",
				devname, strerror(errno));
			rv = 1;
			goto out;
		}
	}
out:
	sysfs_free(mdi);
	return rv;
}

static void remove_devices(char *devnm, char *path)
{
	/*
	 * Remove names at 'path' - possibly with
	 * partition suffixes - which link to the 'standard'
	 * name for devnm.  These were probably created
	 * by mdadm when the array was assembled.
	 */
	char base[40];
	char *path2;
	char link[1024];
	int n;
	int part;
	char *be;
	char *pe;

	if (!path)
		return;

	sprintf(base, "/dev/%s", devnm);
	be = base + strlen(base);

	path2 = xmalloc(strlen(path)+20);
	strcpy(path2, path);
	pe = path2 + strlen(path2);

	for (part = 0; part < 16; part++) {
		if (part) {
			sprintf(be, "p%d", part);

			if (isdigit(pe[-1]))
				sprintf(pe, "p%d", part);
			else
				sprintf(pe, "%d", part);
		}
		n = readlink(path2, link, sizeof(link));
		if (n > 0 && (int)strlen(base) == n &&
		    strncmp(link, base, n) == 0)
			unlink(path2);
	}
	free(path2);
}

int Manage_run(char *devname, int fd, struct context *c)
{
	/* Run the array.  Array must already be configured
	 *  Requires >= 0.90.0
	 */
	char nm[32], *nmp;

	nmp = fd2devnm(fd);
	if (!nmp) {
		pr_err("Cannot find %s in sysfs!!\n", devname);
		return 1;
	}
	snprintf(nm, sizeof(nm), "%s", nmp);
	return IncrementalScan(c, nm);
}

int Manage_stop(char *devname, int fd, int verbose, int will_retry)
{
	/* Stop the array.  Array must already be configured
	 * 'will_retry' means that error messages are not wanted.
	 */
	int rv = 0;
	struct map_ent *map = NULL;
	struct mdinfo *mdi;
	char devnm[32];
	char container[MD_NAME_MAX] = {0};
	int err;
	int count;
	char buf[SYSFS_MAX_BUF_SIZE];
	unsigned long long rd1, rd2;

	if (will_retry && verbose == 0)
		verbose = -1;

	snprintf(devnm, sizeof(devnm), "%s", fd2devnm(fd));
	/* Get EXCL access first.  If this fails, then attempting
	 * to stop is probably a bad idea.
	 */
	mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
	if (mdi && is_subarray(mdi->text_version))
		sysfs_get_container_devnm(mdi, container);

	close_fd(&fd);
	count = 5;
	while (((fd = ((devname[0] == '/')
		       ?open(devname, O_RDONLY|O_EXCL)
		       :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0 ||
		strcmp(fd2devnm(fd), devnm) != 0) && container[0] &&
	       mdmon_running(container) && count) {
		/* Can't open, so something might be wrong.  However it
		 * is a container, so we might be racing with mdmon, so
		 * retry for a bit.
		 */
		close_fd(&fd);
		flush_mdmon(container);
		count--;
	}
	if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) {
		close_fd(&fd);
		if (verbose >= 0)
			pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n",
			       devname);
		sysfs_free(mdi);
		return 1;
	}
	/* If this is an mdmon managed array, just write 'inactive'
	 * to the array state and let mdmon clear up.
	 */
	if (mdi &&
	    mdi->array.level > 0 &&
	    is_subarray(mdi->text_version)) {
		int err;
		/* This is mdmon managed. */
		close_fd(&fd);

		/* As we had an O_EXCL open, any use of the device
		 * which blocks STOP_ARRAY is probably a transient use,
		 * so it is reasonable to retry for a while - 5 seconds.
		 */
		count = 25;
		while (count &&
		       (err = sysfs_set_str(mdi, NULL,
					    "array_state",
					    "inactive")) < 0 &&
		       errno == EBUSY) {
			err = errno;
			sleep_for(0, MSEC_TO_NSEC(200), true);
			count--;
		}
		if (err) {
			if (verbose >= 0)
				pr_err("failed to stop array %s: %s\n",
				       devname, strerror(err));
			rv = 1;
			goto out;
		}

		/* Give monitor a chance to act */
		ping_monitor(mdi->text_version);

		fd = open_dev_excl(devnm);
		if (fd < 0) {
			if (verbose >= 0)
				pr_err("failed to completely stop %s: Device is busy\n",
				       devname);
			rv = 1;
			goto out;
		}
	} else if (mdi &&
		   mdi->array.major_version == -1 &&
		   mdi->array.minor_version == -2 &&
		   !is_subarray(mdi->text_version)) {
		struct mdstat_ent *mds, *m;
		/* container, possibly mdmon-managed.
		 * Make sure mdmon isn't opening it, which
		 * would interfere with the 'stop'
		 */
		ping_monitor(mdi->sys_name);

		/* now check that there are no existing arrays
		 * which are members of this array
		 */
		mds = mdstat_read(0, 0);
		for (m = mds; m; m = m->next)
			if (is_mdstat_ent_external(m) &&
			    metadata_container_matches(m->metadata_version + 9, devnm)) {
				if (verbose >= 0)
					pr_err("Cannot stop container %s: member %s still active\n",
					       devname, m->devnm);
				free_mdstat(mds);
				rv = 1;
				goto out;
			}
	}

	/* If the array is undergoing a reshape which changes the number
	 * of devices, then it would be nice to stop it at a point where
	 * it has completed a full number of stripes in both old and
	 * new layouts as this will allow the reshape to be reverted.
	 * So if 'sync_action' is "reshape" and 'raid_disks' shows two
	 * different numbers, then
	 *  - freeze reshape
	 *  - set sync_max to next multiple of both data_disks and
	 *    chunk sizes (or next but one)
	 *  - unfreeze reshape
	 *  - wait on 'sync_completed' for that point to be reached.
	 */
	if (mdi && is_level456(mdi->array.level) &&
	    sysfs_attribute_available(mdi, NULL, "sync_action") &&
	    sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
	    sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf)) > 0 &&
	    strcmp(buf, "reshape\n") == 0 &&
	    sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) {
		unsigned long long position, curr;
		unsigned long long chunk1, chunk2;
		unsigned long long rddiv, chunkdiv;
		unsigned long long sectors;
		unsigned long long sync_max, old_sync_max;
		unsigned long long completed;
		int backwards = 0;
		int delay;
		int scfd;

		delay = 40;
		while (rd1 > rd2 && delay > 0 &&
		       sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) {
			/* must be in the critical section - wait a bit */
			delay -= 1;
			sleep_for(0, MSEC_TO_NSEC(100), true);
		}

		if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0)
			goto done;
		/* Array is frozen */

		rd1 -= mdi->array.level == 6 ? 2 : 1;
		rd2 -= mdi->array.level == 6 ? 2 : 1;
		sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
		if (strncmp(buf, "back", 4) == 0)
			backwards = 1;
		if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) {
			/* reshape must have finished now */
			sysfs_set_str(mdi, NULL, "sync_action", "idle");
			goto done;
		}
		sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
		chunk1 /= 512;
		chunk2 /= 512;
		rddiv = GCD(rd1, rd2);
		chunkdiv = GCD(chunk1, chunk2);
		sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;

		if (backwards) {
			/* Need to subtract 'reshape_position' from
			 * array size to get equivalent of sync_max.
			 * Size calculation based on raid5_size in kernel.
			 */
			unsigned long long size = mdi->component_size;
			size &= ~(chunk1-1);
			size &= ~(chunk2-1);
			/* rd1 must be smaller */
			/* Reshape may have progressed further backwards than
			 * recorded, so target even further back (hence "-1")
			 */
			position = (position / sectors - 1) * sectors;
			/* rd1 is always the conversion factor between 'sync'
			 * position and 'reshape' position.
			 * We read 1 "new" stripe worth of data from where-ever,
			 * and when write out that full stripe.
			 */
			sync_max = size - position/rd1;
		} else {
			/* Reshape will very likely be beyond position, and it may
			 * be too late to stop at '+1', so aim for '+2'
			 */
			position = (position / sectors + 2) * sectors;
			sync_max = position/rd1;
		}
		if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
			old_sync_max = mdi->component_size;
		/* Must not advance sync_max as that could confuse
		 * the reshape monitor */
		if (sync_max < old_sync_max)
			sysfs_set_num(mdi, NULL, "sync_max", sync_max);
		sysfs_set_str(mdi, NULL, "sync_action", "idle");

		/* That should have set things going again.  Now we
		 * wait a little while (3 second max) for sync_completed
		 * to reach the target.
		 * The reshape process can block for 500msec if
		 * the sync speed limit is hit, so we need to wait
		 * a lot longer than that. 1 second is usually
		 * enough.  3 is safe.
		 */
		delay = 3000;
		scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
		while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
			unsigned long long max_completed;
			sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
			sysfs_fd_get_str(scfd, buf, sizeof(buf));
			if (str_is_none(buf) == true) {
				/* Either reshape has aborted, or hasn't
				 * quite started yet.  Wait a bit and
				 * check  'sync_action' to see.
				 */
				sleep_for(0, MSEC_TO_NSEC(10), true);
				sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
				if (strncmp(buf, "reshape", 7) != 0)
					break;
			}

			if (sysfs_fd_get_two(scfd, &completed,
					     &max_completed) == 2 &&
			    /* 'completed' sometimes reads as max-uulong */
			    completed < max_completed &&
			    (completed > sync_max ||
			     (completed == sync_max && curr != position))) {
				while (completed > sync_max) {
					sync_max += sectors / rd1;
					if (backwards)
						position -= sectors;
					else
						position += sectors;
				}
				if (sync_max < old_sync_max)
					sysfs_set_num(mdi, NULL, "sync_max", sync_max);
			}

			if (!backwards && curr >= position)
				break;
			if (backwards && curr <= position)
				break;
			sysfs_wait(scfd, &delay);
		}
		close_fd(&scfd);

	}
done:

	/* As we have an O_EXCL open, any use of the device
	 * which blocks STOP_ARRAY is probably a transient use,
	 * so it is reasonable to retry for a while - 5 seconds.
	 */
	count = 25; err = 0;
	while (count && fd >= 0 &&
	       (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 && errno == EBUSY) {
		err = errno;
		sleep_for(0, MSEC_TO_NSEC(200), true);
		count --;
	}
	if (fd >= 0 && err) {
		if (verbose >= 0) {
			pr_err("failed to stop array %s: %s\n",
			       devname, strerror(err));
			if (err == EBUSY)
				cont_err("Perhaps a running process, mounted filesystem or active volume group?\n");
		}
		rv = 1;
		goto out;
	}

	if (devnm[0] && udev_is_available()) {
		struct map_ent *mp = map_by_devnm(&map, devnm);
		remove_devices(devnm, mp ? mp->path : NULL);
	}

	if (verbose >= 0)
		pr_info("stopped %s\n", devname);
	map_lock(&map);
	map_remove(&map, devnm);
	map_unlock(&map);
out:
	sysfs_free(mdi);
	close_fd(&fd);

	return rv;
}

static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp)
{
	struct mddev_dev *new;
	new = xmalloc(sizeof(*new));
	memset(new, 0, sizeof(*new));
	new->devname = xstrdup(name);
	new->disposition = disp;
	new->next = dv->next;
	dv->next = new;
	return new;
}

static void add_faulty(struct mddev_dev *dv, int fd, char disp)
{
	mdu_array_info_t array;
	mdu_disk_info_t disk;
	int remaining_disks;
	int i;

	if (md_get_array_info(fd, &array) != 0)
		return;

	remaining_disks = array.nr_disks;
	for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
		char buf[40];
		disk.number = i;
		if (md_get_disk_info(fd, &disk) != 0)
			continue;
		if (disk.major == 0 && disk.minor == 0)
			continue;
		remaining_disks--;
		if ((disk.state & 1) == 0) /* not faulty */
			continue;
		sprintf(buf, "%d:%d", disk.major, disk.minor);
		dv = add_one(dv, buf, disp);
	}
}

static void add_detached(struct mddev_dev *dv, int fd, char disp)
{
	mdu_array_info_t array;
	mdu_disk_info_t disk;
	int remaining_disks;
	int i;

	if (md_get_array_info(fd, &array) != 0)
		return;

	remaining_disks = array.nr_disks;
	for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
		char buf[40];
		int sfd;
		disk.number = i;
		if (md_get_disk_info(fd, &disk) != 0)
			continue;
		if (disk.major == 0 && disk.minor == 0)
			continue;
		remaining_disks--;
		if (disp == 'f' && (disk.state & 1) != 0) /* already faulty */
			continue;
		sprintf(buf, "%d:%d", disk.major, disk.minor);
		sfd = dev_open(buf, O_RDONLY);
		if (sfd >= 0) {
			/* Not detached */
			close(sfd);
			continue;
		}
		if (errno != ENXIO)
			/* Probably not detached */
			continue;
		dv = add_one(dv, buf, disp);
	}
}

static void add_set(struct mddev_dev *dv, int fd, char set_char)
{
	mdu_array_info_t array;
	mdu_disk_info_t disk;
	int remaining_disks;
	int copies, set;
	int i;

	if (md_get_array_info(fd, &array) != 0)
		return;
	if (array.level != 10)
		return;
	copies = ((array.layout & 0xff) *
		  ((array.layout >> 8) & 0xff));
	if (array.raid_disks % copies)
		return;

	remaining_disks = array.nr_disks;
	for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
		char buf[40];
		disk.number = i;
		if (md_get_disk_info(fd, &disk) != 0)
			continue;
		if (disk.major == 0 && disk.minor == 0)
			continue;
		remaining_disks--;
		set = disk.raid_disk % copies;
		if (set_char != set + 'A')
			continue;
		sprintf(buf, "%d:%d", disk.major, disk.minor);
		dv = add_one(dv, buf, dv->disposition);
	}
}

int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
		   struct supertype *dev_st, struct supertype *tst,
		   unsigned long rdev, enum update_opt update,
		   char *devname, int verbose, mdu_array_info_t *array)
{
	struct mdinfo mdi;
	int duuid[4];
	int ouuid[4];

	dev_st->ss->getinfo_super(dev_st, &mdi, NULL);
	dev_st->ss->uuid_from_super(dev_st, ouuid);
	if (tst->sb)
		tst->ss->uuid_from_super(tst, duuid);
	else
		/* Assume uuid matches: kernel will check */
		memcpy(duuid, ouuid, sizeof(ouuid));
	if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) &&
	    !(mdi.disk.state & (1<<MD_DISK_FAULTY)) &&
	    memcmp(duuid, ouuid, sizeof(ouuid))==0) {
		/* Looks like it is worth a
		 * try.  Need to make sure
		 * kernel will accept it
		 * though.
		 */
		mdu_disk_info_t disc;
		disc.number = mdi.disk.number;
		if (md_get_disk_info(fd, &disc) != 0 ||
		    disc.major != 0 || disc.minor != 0)
			goto skip_re_add;
		disc.major = major(rdev);
		disc.minor = minor(rdev);
		disc.number = mdi.disk.number;
		disc.raid_disk = mdi.disk.raid_disk;
		disc.state = mdi.disk.state;
		if (array->state & (1 << MD_SB_CLUSTERED)) {
			/* extra flags are needed when adding to a cluster as
			 * there are two cases to distinguish
			 */
			if (dv->disposition == 'c')
				disc.state |= (1 << MD_DISK_CANDIDATE);
			else
				disc.state |= (1 << MD_DISK_CLUSTER_ADD);
		}
		if (dv->writemostly == FlagSet)
			disc.state |= 1 << MD_DISK_WRITEMOSTLY;
		if (dv->writemostly == FlagClear)
			disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
		if (dv->failfast == FlagSet)
			disc.state |= 1 << MD_DISK_FAILFAST;
		if (dv->failfast == FlagClear)
			disc.state &= ~(1 << MD_DISK_FAILFAST);
		remove_partitions(tfd);
		if (update || dv->writemostly != FlagDefault ||
		    dv->failfast != FlagDefault) {
			int rv = -1;
			tfd = dev_open(dv->devname, O_RDWR);
			if (tfd < 0) {
				pr_err("failed to open %s for superblock update during re-add\n", dv->devname);
				return -1;
			}

			if (dv->writemostly == FlagSet)
				rv = dev_st->ss->update_super(
					dev_st, NULL, UOPT_SPEC_WRITEMOSTLY,
					devname, verbose, 0, NULL);
			if (dv->writemostly == FlagClear)
				rv = dev_st->ss->update_super(
					dev_st, NULL, UOPT_SPEC_READWRITE,
					devname, verbose, 0, NULL);
			if (dv->failfast == FlagSet)
				rv = dev_st->ss->update_super(
					dev_st, NULL, UOPT_SPEC_FAILFAST,
					devname, verbose, 0, NULL);
			if (dv->failfast == FlagClear)
				rv = dev_st->ss->update_super(
					dev_st, NULL, UOPT_SPEC_NOFAILFAST,
					devname, verbose, 0, NULL);
			if (update)
				rv = dev_st->ss->update_super(
					dev_st, NULL, update,
					devname, verbose, 0, NULL);
			if (rv == 0)
				rv = dev_st->ss->store_super(dev_st, tfd);
			close_fd(&tfd);
			if (rv != 0) {
				pr_err("failed to update superblock during re-add\n");
				return -1;
			}
		}
		/* don't even try if disk is marked as faulty */
		errno = 0;
		if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) {
			if (verbose >= 0)
				pr_err("re-added %s\n", dv->devname);
			return 1;
		}
		if (errno == ENOMEM || errno == EROFS) {
			pr_err("add new device failed for %s: %s\n",
			       dv->devname, strerror(errno));
			if (dv->disposition == 'M')
				return 0;
			return -1;
		}
	}
skip_re_add:
	return 0;
}

/**
 * manage_add_external() - Add disk to external container.
 * @st: external supertype pointer, must not be NULL, superblock is released here.
 * @fd: container file descriptor, must not have O_EXCL mode.
 * @disk_fd: device to add file descriptor.
 * @disk_name: name of the device to add.
 * @disc: disk info.
 *
 * Superblock is released here because any open fd with O_EXCL will block sysfs_add_disk().
 */
mdadm_status_t manage_add_external(struct supertype *st, int fd, char *disk_name,
				   mdu_disk_info_t *disc)
{
	mdadm_status_t rv = MDADM_STATUS_ERROR;
	char container_devpath[MD_NAME_MAX];
	struct dev_policy *pols = NULL;
	struct mdinfo new_mdi;
	struct mdinfo *sra = NULL;
	int container_fd;
	int disk_fd = -1;

	snprintf(container_devpath, MD_NAME_MAX, "%s", fd2devnm(fd));

	container_fd = open_dev_excl(container_devpath);
	if (!is_fd_valid(container_fd)) {
		pr_err("Failed to get exclusive access to container %s\n", container_devpath);
		return MDADM_STATUS_ERROR;
	}

	/* Check if metadata handler is able to accept the drive */
	if (!st->ss->validate_geometry(st, LEVEL_CONTAINER, 0, 1, NULL, 0, 0, disk_name, NULL,
				       0, 1))
		goto out;

	if (mddev_test_and_add_drive_policies(st, &pols, container_fd, 1))
		goto out;

	Kill(disk_name, NULL, 0, -1, 0);

	disk_fd = dev_open(disk_name, O_RDWR | O_EXCL | O_DIRECT);
	if (!is_fd_valid(disk_fd)) {
		pr_err("Failed to exclusively open %s\n", disk_name);
		goto out;
	}

	if (drive_test_and_add_policies(st, &pols, disk_fd, 1))
		goto out;

	if (st->ss->add_to_super(st, disc, disk_fd, disk_name, INVALID_SECTORS))
		goto out;

	if (!mdmon_running(st->container_devnm))
		st->ss->sync_metadata(st);

	sra = sysfs_read(container_fd, NULL, 0);
	if (!sra) {
		pr_err("Failed to read sysfs for %s\n", disk_name);
		goto out;
	}

	sra->array.level = LEVEL_CONTAINER;
	/* Need to set data_offset and component_size */
	st->ss->getinfo_super(st, &new_mdi, NULL);
	new_mdi.disk.major = disc->major;
	new_mdi.disk.minor = disc->minor;
	new_mdi.recovery_start = 0;

	st->ss->free_super(st);

	if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
		pr_err("Failed to add %s to container %s\n", disk_name, container_devpath);
		goto out;
	}
	ping_monitor(container_devpath);
	rv = MDADM_STATUS_SUCCESS;

out:
	close_fd(&container_fd);
	dev_policy_free(pols);

	if (sra)
		sysfs_free(sra);

	if (rv != MDADM_STATUS_SUCCESS)
		/* Metadata handler records this descriptor, so release it only on failure. */
		close_fd(&disk_fd);

	if (st->sb)
		st->ss->free_super(st);

	return rv;
}

int Manage_add(int fd, int tfd, struct mddev_dev *dv,
	       struct supertype *tst, mdu_array_info_t *array,
	       int force, int verbose, char *devname,
	       enum update_opt update, unsigned long rdev,
	       unsigned long long array_size, int raid_slot)
{
	unsigned long long ldsize;
	struct supertype *dev_st;
	int j;
	mdu_disk_info_t disc;
	struct map_ent *map = NULL;
	bool add_new_super = false;

	if (!get_dev_size(tfd, dv->devname, &ldsize)) {
		if (dv->disposition == 'M')
			return 0;
		else
			return -1;
	}

	if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
		/* More than 4TB is wasted on v0.90 */
		if (!force) {
			pr_err("%s is larger than %s can effectively use.\n"
			       "       Add --force is you really want to add this device.\n",
			       dv->devname, devname);
			return -1;
		}
		pr_err("%s is larger than %s can effectively use.\n"
		       "       Adding anyway as --force was given.\n",
		       dv->devname, devname);
	}

	if (array->not_persistent == 0 || tst->ss->external) {

		/* need to find a sample superblock to copy, and
		 * a spare slot to use.
		 * For 'external' array (well, container based),
		 * We can just load the metadata for the array->
		 */
		int array_failed;
		if (tst->sb)
			/* already loaded */;
		else if (tst->ss->external) {
			tst->ss->load_container(tst, fd, NULL);
		} else for (j = 0; j < tst->max_devs; j++) {
				char *dev;
				int dfd;
				disc.number = j;
				if (md_get_disk_info(fd, &disc))
					continue;
				if (disc.major==0 && disc.minor==0)
					continue;
				if ((disc.state & 4)==0) /* sync */
					continue;
				/* Looks like a good device to try */
				dev = map_dev(disc.major, disc.minor, 1);
				if (!dev)
					continue;
				dfd = dev_open(dev, O_RDONLY);
				if (dfd < 0)
					continue;
				if (tst->ss->load_super(tst, dfd,
							NULL)) {
					close_fd(&dfd);
					continue;
				}
				close_fd(&dfd);
				break;
			}
		/* FIXME this is a bad test to be using */
		if (!tst->sb && (dv->disposition != 'a' &&
				 dv->disposition != 'S')) {
			/* we are re-adding a device to a
			 * completely dead array - have to depend
			 * on kernel to check
			 */
		} else if (!tst->sb) {
			pr_err("cannot load array metadata from %s\n", devname);
			return -1;
		}

		/* Make sure device is large enough */
		if (dv->disposition != 'j' &&  /* skip size check for Journal */
		    tst->sb &&
		    tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
		    array_size) {
			if (dv->disposition == 'M')
				return 0;
			pr_err("%s not large enough to join array\n",
			       dv->devname);
			return -1;
		}

		/* Possibly this device was recently part of
		 * the array and was temporarily removed, and
		 * is now being re-added.  If so, we can
		 * simply re-add it.
		 */

		if (array->not_persistent == 0 && dv->disposition != 'S') {
			int rv = 0;

			dev_st = dup_super(tst);
			dev_st->ss->load_super(dev_st, tfd, NULL);

			if (dev_st->sb) {
				rv = attempt_re_add(fd, tfd, dv, dev_st, tst, rdev, update,
						    devname, verbose, array);

				dev_st->ss->free_super(dev_st);
			}

			free(dev_st);

			if (rv)
				return rv;
		}
		if (dv->disposition == 'M') {
			if (verbose > 0)
				pr_err("--re-add for %s to %s is not possible\n",
				       dv->devname, devname);
			return 0;
		}
		if (dv->disposition == 'A') {
			pr_err("--re-add for %s to %s is not possible\n",
			       dv->devname, devname);
			return -1;
		}
		if (array->active_disks < array->raid_disks) {
			char *avail = xcalloc(array->raid_disks, 1);
			int d;
			int found = 0;

			for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
				disc.number = d;
				if (md_get_disk_info(fd, &disc))
					continue;
				if (disc.major == 0 && disc.minor == 0)
					continue;
				if (!(disc.state & (1<<MD_DISK_SYNC)))
					continue;
				avail[disc.raid_disk] = 1;
				found++;
			}
			array_failed = !enough(array->level, array->raid_disks,
					       array->layout, 1, avail);
			free(avail);
		} else
			array_failed = 0;
		if (array_failed) {
			pr_err("%s has failed so using --add cannot work and might destroy\n",
			       devname);
			pr_err("data on %s.  You should stop the array and re-assemble it.\n",
			       dv->devname);
			return -1;
		}
	} else {
		/* non-persistent. Must ensure that new drive
		 * is at least array->size big.
		 */
		if (ldsize/512 < array_size) {
			pr_err("%s not large enough to join array\n",
			       dv->devname);
			return -1;
		}
	}
	/* committed to really trying this device now*/
	remove_partitions(tfd);

	/* in 2.6.17 and earlier, version-1 superblocks won't
	 * use the number we write, but will choose a free number.
	 * we must choose the same free number, which requires
	 * starting at 'raid_disks' and counting up
	 */
	for (j = array->raid_disks; j < tst->max_devs; j++) {
		disc.number = j;
		if (md_get_disk_info(fd, &disc))
			break;
		if (disc.major==0 && disc.minor==0)
			break;
		if (disc.state & 8) /* removed */
			break;
	}
	disc.major = major(rdev);
	disc.minor = minor(rdev);
	if (raid_slot < 0)
		disc.number = j;
	else
		disc.number = raid_slot;
	disc.state = 0;

	/* only add journal to array that supports journaling */
	if (dv->disposition == 'j') {
		struct mdinfo *mdp;

		mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
		if (!mdp) {
			pr_err("%s unable to read array state.\n", devname);
			return -1;
		}

		if (mdp->array_state != ARRAY_READONLY) {
			sysfs_free(mdp);
			pr_err("%s is not readonly, cannot add journal.\n", devname);
			return -1;
		}

		sysfs_free(mdp);

		disc.raid_disk = 0;
	}

	if (map_lock(&map))
		pr_err("failed to get exclusive lock on mapfile when add disk\n");

	if (array->not_persistent==0) {
		int dfd;
		if (dv->disposition == 'j')
			disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
		if (dv->writemostly == FlagSet)
			disc.state |= 1 << MD_DISK_WRITEMOSTLY;
		if (dv->failfast == FlagSet)
			disc.state |= 1 << MD_DISK_FAILFAST;
		dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
		if (tst->ss->add_to_super(tst, &disc, dfd,
					  dv->devname, INVALID_SECTORS))
			goto unlock;
		if (tst->ss->write_init_super(tst))
			goto unlock;
		add_new_super = true;
	} else if (dv->disposition == 'A') {
		/*  this had better be raid1.
		 * As we are "--re-add"ing we must find a spare slot
		 * to fill.
		 */
		char *used = xcalloc(array->raid_disks, 1);
		for (j = 0; j < tst->max_devs; j++) {
			mdu_disk_info_t disc2;
			disc2.number = j;
			if (md_get_disk_info(fd, &disc2))
				continue;
			if (disc2.major==0 && disc2.minor==0)
				continue;
			if (disc2.state & 8) /* removed */
				continue;
			if (disc2.raid_disk < 0)
				continue;
			if (disc2.raid_disk > array->raid_disks)
				continue;
			used[disc2.raid_disk] = 1;
		}
		for (j = 0 ; j < array->raid_disks; j++)
			if (!used[j]) {
				disc.raid_disk = j;
				disc.state |= (1<<MD_DISK_SYNC);
				break;
			}
		free(used);
	}

	if (array->state & (1 << MD_SB_CLUSTERED)) {
		if (dv->disposition == 'c')
			disc.state |= (1 << MD_DISK_CANDIDATE);
		else
			disc.state |= (1 << MD_DISK_CLUSTER_ADD);
	}

	if (dv->writemostly == FlagSet)
		disc.state |= (1 << MD_DISK_WRITEMOSTLY);
	if (dv->failfast == FlagSet)
		disc.state |= (1 << MD_DISK_FAILFAST);
	if (tst->ss->external) {
		if (manage_add_external(tst, fd, dv->devname, &disc) != MDADM_STATUS_SUCCESS)
			goto unlock;
	} else {
		tst->ss->free_super(tst);
		if (ioctl(fd, ADD_NEW_DISK, &disc)) {
			if (dv->disposition == 'j')
				pr_err("Failed to hot add %s as journal, "
				       "please try restart %s.\n", dv->devname, devname);
			else
				pr_err("add new device failed for %s as %d: %s\n",
				       dv->devname, j, strerror(errno));
			goto unlock;
		}
		if (dv->disposition == 'j') {
			pr_err("Journal added successfully, making %s read-write\n", devname);
			if (Manage_ro(devname, fd, -1))
				pr_err("Failed to make %s read-write\n", devname);
		}

	}
	if (verbose >= 0)
		pr_err("added %s\n", dv->devname);
	map_unlock(&map);
	return 1;
unlock:
	if (add_new_super)
		Kill(dv->devname, tst, 0, -1, 0);
	map_unlock(&map);
	return -1;
}

int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
		  int sysfd, unsigned long rdev, int force, int verbose, char *devname)
{
	int lfd = -1;
	int err;

	if (tst->ss->external) {
		/* To remove a device from a container, we must
		 * check that it isn't in use in an array.
		 * This involves looking in the 'holders'
		 * directory - there must be just one entry,
		 * the container.
		 * To ensure that it doesn't get used as a
		 * hot spare while we are checking, we
		 * get an O_EXCL open on the container
		 */
		int ret;
		char devnm[32];

		snprintf(devnm, sizeof(devnm), "%s", fd2devnm(fd));
		lfd = open_dev_excl(devnm);
		if (lfd < 0) {
			pr_err("Cannot get exclusive access  to container - odd\n");
			return -1;
		}
		/* We may not be able to check on holders in
		 * sysfs, either because we don't have the dev num
		 * (rdev == 0) or because the device has been detached
		 * and the 'holders' directory no longer exists
		 * (ret == -1).  In that case, assume it is OK to
		 * remove.
		 */
		if (rdev == 0)
			ret = -1;
		else {
			/*
			 * The drive has already been set to 'faulty', however
			 * monitor might not have had time to process it and the
			 * drive might still have an entry in the 'holders'
			 * directory. Try a few times to avoid a false error
			 */
			int count = 20;

			do {
				ret = sysfs_unique_holder(devnm, rdev);
				if (ret < 2)
					break;
				sleep_for(0, MSEC_TO_NSEC(100), true);
			} while (--count > 0);

			if (ret == 0) {
				pr_err("%s is not a member, cannot remove.\n",
					dv->devname);
				close_fd(&lfd);
				return -1;
			}
			if (ret >= 2) {
				pr_err("%s is still in use, cannot remove.\n",
					dv->devname);
				close_fd(&lfd);
				return -1;
			}
		}
	}
	/* FIXME check that it is a current member */
	if (sysfd >= 0) {
		/* device has been removed and we don't know
		 * the major:minor number
		 */
		err = sys_hot_remove_disk(sysfd, force);
	} else {
		err = hot_remove_disk(fd, rdev, force);
		if (err && errno == ENODEV) {
			/* Old kernels rejected this if no personality
			 * is registered */
			struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS);
			struct mdinfo *dev = NULL;

			if (!sra) {
				err = -1;
			} else {
				for (dev = sra->devs; dev ; dev = dev->next)
					if (dev->disk.major == (int)major(rdev) &&
					    dev->disk.minor == (int)minor(rdev))
						break;

				if (dev)
					err = sysfs_set_str(sra, dev,
						    "state", "remove");
				sysfs_free(sra);
			}
		}
	}
	if (err) {
		pr_err("hot remove failed for %s: %s\n",	dv->devname,
		       strerror(errno));
		close_fd(&lfd);
		return -1;
	}
	if (tst->ss->external) {
		/*
		 * Before dropping our exclusive open we make an
		 * attempt at preventing mdmon from seeing an
		 * 'add' event before reconciling this 'remove'
		 * event.
		 */
		char *devnm = fd2devnm(fd);

		if (!devnm) {
			pr_err("unable to get container name\n");
			close_fd(&lfd);
			return -1;
		}

		ping_manager(devnm);
	}
	close_fd(&lfd);
	if (verbose >= 0)
		pr_err("hot removed %s from %s\n",
		       dv->devname, devname);
	return 1;
}

int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv,
		   unsigned long rdev, int verbose, char *devname)
{
	struct mdinfo *mdi, *di;
	if (tst->ss->external) {
		pr_err("--replace only supported for native metadata (0.90 or 1.x)\n");
		return -1;
	}
	/* Need to find the device in sysfs and add 'want_replacement' to the
	 * status.
	 */
	mdi = sysfs_read(fd, NULL, GET_DEVS);
	if (!mdi || !mdi->devs) {
		pr_err("Cannot find status of %s to enable replacement - strange\n",
		       devname);
		goto abort;
	}
	for (di = mdi->devs; di; di = di->next)
		if (di->disk.major == (int)major(rdev) &&
		    di->disk.minor == (int)minor(rdev))
			break;
	if (di) {
		int rv;
		if (di->disk.raid_disk < 0) {
			pr_err("%s is not active and so cannot be replaced.\n",
			       dv->devname);
			goto abort;
		}
		rv = sysfs_set_str(mdi, di,
				   "state", "want_replacement");
		if (rv) {
			pr_err("Failed to request replacement for %s\n",
			       dv->devname);
			goto abort;
		}
		if (verbose >= 0)
			pr_err("Marked %s (device %d in %s) for replacement\n",
			       dv->devname, di->disk.raid_disk, devname);
		/* If there is a matching 'with', we need to tell it which
		 * raid disk
		 */
		while (dv && dv->disposition != 'W')
			dv = dv->next;
		if (dv) {
			dv->disposition = 'w';
			dv->used = di->disk.raid_disk;
		}
		sysfs_free(mdi);
		return 1;
	}
	pr_err("%s not found in %s so cannot --replace it\n",
	       dv->devname, devname);
abort:
	sysfs_free(mdi);
	return -1;
}

int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
		unsigned long rdev, int verbose, char *devname)
{
	struct mdinfo *mdi, *di;
	/* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */
	mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE);
	if (!mdi || !mdi->devs) {
		pr_err("Cannot find status of %s to enable replacement - strange\n",
		       devname);
		goto abort;
	}
	for (di = mdi->devs; di; di = di->next)
		if (di->disk.major == (int)major(rdev) &&
		    di->disk.minor == (int)minor(rdev))
			break;
	if (di) {
		int rv;
		if (di->disk.state & (1<<MD_DISK_FAULTY)) {
			pr_err("%s is faulty and cannot be a replacement\n",
			       dv->devname);
			goto abort;
		}
		if (di->disk.raid_disk >= 0) {
			pr_err("%s is active and cannot be a replacement\n",
			       dv->devname);
			goto abort;
		}
		rv = sysfs_set_num(mdi, di,
				   "slot", dv->used);
		if (rv) {
			pr_err("Failed to set %s as preferred replacement.\n",
			       dv->devname);
			goto abort;
		}
		if (verbose >= 0)
			pr_err("Marked %s in %s as replacement for device %d\n",
			       dv->devname, devname, dv->used);
		sysfs_free(mdi);
		return 1;
	}
	pr_err("%s not found in %s so cannot make it preferred replacement\n",
	       dv->devname, devname);
abort:
	sysfs_free(mdi);
	return -1;
}

/**
 * is_remove_safe() - Check if remove is safe.
 * @array: Array info.
 * @fd: Array file descriptor.
 * @devname: Name of device to remove.
 * @verbose: Verbose.
 *
 * The function determines if array will be operational
 * after removing &devname.
 *
 * Return: True if array will be operational, false otherwise.
 */
bool is_remove_safe(mdu_array_info_t *array, const int fd, char *devname, const int verbose)
{
	dev_t devid = devnm2devid(devname + 5);
	struct mdinfo *mdi = sysfs_read(fd, NULL, GET_DEVS | GET_DISKS | GET_STATE);
	struct mdinfo *disk;

	if (!mdi) {
		if (verbose)
			pr_err("Failed to read sysfs attributes for %s\n", devname);
		return false;
	}

	char *avail = xcalloc(array->raid_disks, sizeof(char));

	for (disk = mdi->devs; disk; disk = disk->next) {
		if (disk->disk.raid_disk < 0)
			continue;
		if (!(disk->disk.state & (1 << MD_DISK_SYNC)))
			continue;
		if (makedev(disk->disk.major, disk->disk.minor) == devid)
			continue;
		avail[disk->disk.raid_disk] = 1;
	}
	sysfs_free(mdi);

	bool is_enough = enough(array->level, array->raid_disks,
				array->layout, 1, avail);

	free(avail);
	return is_enough;
}

/**
 * Manage_subdevs() - Execute operation depending on devmode.
 *
 * @devname: name of the device.
 * @fd: file descriptor.
 * @devlist: list of sub-devices to manage.
 * @verbose: verbose level.
 * @test: test flag.
 * @update: type of update.
 * @force: force flag.
 *
 * This function executes operation defined by devmode
 * for each dev from devlist.
 * Devmode can be:
 * 'a' - add the device
 * 'S' - add the device as a spare - don't try re-add
 * 'j' - add the device as a journal device
 * 'A' - re-add the device
 * 'r' - remove the device: HOT_REMOVE_DISK
 *       device can be 'faulty' or 'detached' in which case all
 *       matching devices are removed.
 * 'f' - set the device faulty SET_DISK_FAULTY
 *       device can be 'detached' in which case any device that
 *       is inaccessible will be marked faulty.
 * 'I' - remove device by using incremental fail
 *       which is executed when device is removed surprisingly.
 * 'R' - mark this device as wanting replacement.
 * 'W' - this device is added if necessary and activated as
 *       a replacement for a previous 'R' device.
 * -----
 * 'w' - 'W' will be changed to 'w' when it is paired with
 *       a 'R' device.  If a 'W' is found while walking the list
 *       it must be unpaired, and is an error.
 * 'M' - this is created by a 'missing' target.  It is a slight
 *       variant on 'A'
 * 'F' - Another variant of 'A', where the device was faulty
 *       so must be removed from the array first.
 * 'c' - confirm the device as found (for clustered environments)
 *
 * For 'f' and 'r', the device can also be a kernel-internal
 * name such as 'sdb'.
 *
 * Return: 0 on success, otherwise 1 or 2.
 */
int Manage_subdevs(char *devname, int fd,
		   struct mddev_dev *devlist, int verbose, int test,
		   enum update_opt update, int force)
{
	mdu_array_info_t array;
	unsigned long long array_size;
	struct mddev_dev *dv;
	int tfd = -1;
	struct supertype *tst = NULL;
	char *subarray = NULL;
	int sysfd = -1;
	int count = 0; /* number of actions taken */
	struct mdinfo info;
	struct mdinfo devinfo;
	int frozen = 0;
	int busy = 0;
	int raid_slot = -1;

	if (sysfs_init(&info, fd, NULL)) {
		pr_err("sysfs not availabile for %s\n", devname);
		goto abort;
	}

	if (md_get_array_info(fd, &array)) {
		pr_err("Cannot get array info for %s\n", devname);
		goto abort;
	}
	/* array.size is only 32 bits and may be truncated.
	 * So read from sysfs if possible, and record number of sectors
	 */

	array_size = get_component_size(fd);
	if (array_size <= 0)
		array_size = array.size * 2;

	tst = super_by_fd(fd, &subarray);
	if (!tst) {
		pr_err("unsupport array - version %d.%d\n",
			array.major_version, array.minor_version);
		goto abort;
	}

	for (dv = devlist; dv; dv = dv->next) {
		dev_t rdev = 0; /* device to add/remove etc */
		int rv, err = 0;
		int mj, mn;

		if (tst->ss->external && dv->disposition == 'A') {
			pr_err("Cannot re-add member device %s to %s, it is not supported for external metadata, aborting.\n",
			       dv->devname, fd2devnm(fd));
			goto abort;
		}

		if (array.not_persistent == 1 && dv->disposition == 'A') {
			pr_err("Cannot re-add member device %s to %s, array is not persistent, aborting.\n",
			       dv->devname, fd2devnm(fd));
			goto abort;
		}

		raid_slot = -1;
		if (dv->disposition == 'c') {
			rv = parse_cluster_confirm_arg(dv->devname, &dv->devname, &raid_slot);
			if (rv) {
				pr_err("Could not get the devname of cluster\n");
				goto abort;
			}
		}

		if (strcmp(dv->devname, "failed") == 0 || strcmp(dv->devname, "faulty") == 0) {
			if (dv->disposition != 'A' && dv->disposition != 'r') {
				pr_err("%s only meaningful with -r or --re-add, not -%c\n",
					dv->devname, dv->disposition);
				goto abort;
			}
			add_faulty(dv, fd, (dv->disposition == 'A' ? 'F' : 'r'));
			continue;
		}
		if (strcmp(dv->devname, "detached") == 0) {
			if (dv->disposition != 'r' && dv->disposition != 'f') {
				pr_err("%s only meaningful with -r of -f, not -%c\n",
					dv->devname, dv->disposition);
				goto abort;
			}
			add_detached(dv, fd, dv->disposition);
			continue;
		}

		if (strcmp(dv->devname, "missing") == 0) {
			struct mddev_dev *add_devlist;
			struct mddev_dev **dp;

			if (dv->disposition == 'c') {
				rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
				break;
			}

			if (dv->disposition != 'A') {
				pr_err("'missing' only meaningful with --re-add\n");
				goto abort;
			}
			add_devlist = conf_get_devs();
			if (add_devlist == NULL) {
				pr_err("no devices to scan for missing members.\n");
				continue;
			}
			for (dp = &add_devlist; *dp; dp = &(*dp)->next)
				/* 'M' (for 'missing') is like 'A' without errors */
				(*dp)->disposition = 'M';
			*dp = dv->next;
			dv->next = add_devlist;
			continue;
		}

		if (strncmp(dv->devname, "set-", 4) == 0 && strlen(dv->devname) == 5) {
			int copies;

			if (dv->disposition != 'r' && dv->disposition != 'f') {
				pr_err("'%s' only meaningful with -r or -f\n", dv->devname);
				goto abort;
			}

			if (array.level != 10) {
				pr_err("'%s' only meaningful with RAID10 arrays\n", dv->devname);
				goto abort;
			}

			copies = ((array.layout & 0xff) * ((array.layout >> 8) & 0xff));

			if (array.raid_disks % copies != 0 || dv->devname[4] < 'A' ||
			    dv->devname[4] >= 'A' + copies || copies > 26) {
				pr_err("'%s' not meaningful with this array\n", dv->devname);
				goto abort;
			}
			add_set(dv, fd, dv->devname[4]);
			continue;
		}

		if (!strchr(dv->devname, '/')  && !strchr(dv->devname, ':') &&
		    strlen(dv->devname) < 50) {
			char *array_devnm = fd2devnm(fd);

			/* This is a kernel-internal name like 'sda1' */

			if (!strchr("rfI", dv->disposition)) {
				pr_err("%s only meaningful with -r, -f or -I, not -%c\n",
					dv->devname, dv->disposition);
				goto abort;
			}

			sysfd = sysfs_open_memb_attr(array_devnm, dv->devname, "state", O_RDWR);
			if (!is_fd_valid(sysfd)) {
				pr_err("%s does not appear to be a component of %s\n", dv->devname,
					devname);
					goto abort;
				}
		} else if (strchr("rf", dv->disposition) && get_maj_min(dv->devname, &mj, &mn)) {
			/* for 'fail' and 'remove', the device might not exist. */
			rdev = makedev(mj, mn);
		} else {
			tfd = dev_open(dv->devname, O_RDONLY);
			if (tfd >= 0) {
				fstat_is_blkdev(tfd, dv->devname, &rdev);
				close_fd(&tfd);
			} else {
				int open_err = errno;

				if (!stat_is_blkdev(dv->devname, &rdev)) {
					if (dv->disposition == 'M')
						/* non-fatal. Also improbable */
						continue;
					goto abort;
				}
				if (dv->disposition == 'r')
					/* Be happy, the stat worked, that is
					 * enough for --remove
					 */
					;
				else {
					if (dv->disposition == 'M')
						/* non-fatal */
						continue;
					pr_err("Cannot open %s: %s\n", dv->devname,
					       strerror(open_err));
					goto abort;
				}
			}
		}
		switch (dv->disposition) {
		default:
			pr_err("internal error - devmode[%s]=%d\n", dv->devname, dv->disposition);
			goto abort;
		case 'a':
		case 'S': /* --add-spare */
		case 'j': /* --add-journal */
		case 'A':
		case 'M': /* --re-add missing */
		case 'F': /* --re-add faulty  */
		case 'c': /* --cluster-confirm */
			/* add the device */
			if (subarray) {
				pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
				goto abort;
			}

			/* Let's first try to write re-add to sysfs */
			if (rdev != 0 && (dv->disposition == 'A' || dv->disposition == 'F')) {
				sysfs_init_dev(&devinfo, rdev);
				if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
					pr_err("re-add %s to %s succeed\n", dv->devname,
					       info.sys_name);
					break;
				}
			}

			if (dv->disposition == 'F')
				/* Need to remove first */
				hot_remove_disk(fd, rdev, force);
			/* Make sure it isn't in use (in 2.6 or later) */
			tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
			if (tfd >= 0) {
				/* We know no-one else is using it.  We'll
				 * need non-exclusive access to add it, so
				 * do that now.
				 */
				close_fd(&tfd);
				tfd = dev_open(dv->devname, O_RDONLY);
			}
			if (tfd < 0) {
				if (dv->disposition == 'M')
					continue;
				pr_err("Cannot open %s: %s\n",
					dv->devname, strerror(errno));
				goto abort;
			}
			if (!frozen) {
				if (sysfs_freeze_array(&info) == 1)
					frozen = 1;
				else
					frozen = -1;
			}
			rv = Manage_add(fd, tfd, dv, tst, &array, force, verbose, devname, update,
					rdev, array_size, raid_slot);
			close_fd(&tfd);
			if (rv < 0)
				goto abort;
			if (rv > 0)
				count++;
			break;

		case 'r':
			/* hot remove */
			if (subarray) {
				pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n");
				rv = -1;
			} else
				rv = Manage_remove(tst, fd, dv, sysfd, rdev, verbose, force,
						   devname);
			close_fd(&sysfd);

			if (rv < 0)
				goto abort;
			if (rv > 0)
				count++;
			break;

		case 'f': /* set faulty */
			if (!is_remove_safe(&array, fd, dv->devname, verbose)) {
				pr_err("Cannot remove %s from %s, array will be failed.\n",
				       dv->devname, devname);
				close_fd(&sysfd);
				goto abort;
			}
		case 'I':
			if (is_fd_valid(sysfd)) {
				static const char val[] = "faulty";

				rv = sysfs_write_descriptor(sysfd, val, strlen(val), &err);
			} else {
				rv = ioctl(fd, SET_DISK_FAULTY, rdev);
				if (rv)
					err = errno;
			}

			close_fd(&sysfd);

			if (rv == MDADM_STATUS_SUCCESS) {
				count++;

				pr_vrb("set %s faulty in %s\n", dv->devname, devname);
				break;
			}

			if (err == EBUSY)
				busy = 1;

			pr_err("set device faulty failed for %s: %s\n", dv->devname, strerror(err));
			goto abort;
		case 'R': /* Mark as replaceable */
			if (subarray) {
				pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n");
				rv = -1;
			} else {
				if (!frozen) {
					if (sysfs_freeze_array(&info) == 1)
						frozen = 1;
					else
						frozen = -1;
				}
				rv = Manage_replace(tst, fd, dv, rdev, verbose, devname);
			}
			if (rv < 0)
				goto abort;
			if (rv > 0)
				count++;
			break;
		case 'W': /* --with device that doesn't match */
			pr_err("No matching --replace device for --with %s\n", dv->devname);
			goto abort;
		case 'w': /* --with device which was matched */
			rv = Manage_with(tst, fd, dv, rdev, verbose, devname);
			if (rv < 0)
				goto abort;
			break;
		}
	}
	free(tst);
	if (frozen > 0)
		sysfs_set_str(&info, NULL, "sync_action", "idle");
	if (test && count == 0)
		return 2;
	return 0;

abort:
	free(tst);
	if (frozen > 0)
		sysfs_set_str(&info, NULL, "sync_action", "idle");
	return !test && busy ? 2 : 1;
}

int autodetect(void)
{
	/* Open any md device, and issue the RAID_AUTORUN ioctl */
	int rv = 1;
	int fd = dev_open("9:0", O_RDONLY);
	if (fd >= 0) {
		if (ioctl(fd, RAID_AUTORUN, 0) == 0)
			rv = 0;
		close_fd(&fd);
	}
	return rv;
}

int Update_subarray(char *dev, char *subarray, enum update_opt update,
		    struct mddev_ident *ident, int verbose)
{
	struct supertype supertype, *st = &supertype;
	int fd, rv = 2;
	struct mdinfo *info = NULL;
	char *update_verb = map_num(update_options, update);
	bool allow_active = update == UOPT_PPL || update == UOPT_NO_PPL;

	memset(st, 0, sizeof(*st));

	fd = open_subarray(dev, subarray, st, verbose < 0);
	if (fd < 0)
		return 2;

	if (!st->ss->update_subarray) {
		if (verbose >= 0)
			pr_err("Operation not supported for %s metadata\n",
			       st->ss->name);
		goto free_super;
	}

	if (!allow_active && is_subarray_active(subarray, st->devnm)) {
		if (verbose >= 0)
			pr_err("Subarray %s in %s is active, cannot update %s\n",
				subarray, dev, update_verb);
		goto free_super;
	}

	if (mdmon_running(st->devnm))
		st->update_tail = &st->updates;

	info = st->ss->container_content(st, subarray);

	if (update == UOPT_PPL && !is_level456(info->array.level)) {
		pr_err("RWH policy ppl is supported only for raid4, raid5 and raid6.\n");
		goto free_super;
	}

	rv = st->ss->update_subarray(st, subarray, update, ident);

	if (rv) {
		if (verbose >= 0)
			pr_err("Failed to update %s of subarray-%s in %s\n",
				update_verb, subarray, dev);
	} else if (st->update_tail)
		flush_metadata_updates(st);
	else
		st->ss->sync_metadata(st);

	if (rv == 0 && update == UOPT_NAME && verbose >= 0)
		pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n",
		       subarray, dev);

free_super:
	if (info)
		free(info);
	st->ss->free_super(st);
	close_fd(&fd);

	return rv;
}

/* Move spare from one array to another If adding to destination array fails
 * add back to original array.
 * Returns 1 on success, 0 on failure */
int move_spare(char *from_devname, char *to_devname, dev_t devid)
{
	struct mddev_dev devlist;
	char devname[20];

	/* try to remove and add */
	int fd1 = open(to_devname, O_RDONLY);
	int fd2 = open(from_devname, O_RDONLY);

	if (fd1 < 0 || fd2 < 0) {
		close_fd(&fd1);
		close_fd(&fd2);
		return 0;
	}

	devlist.next = NULL;
	devlist.used = 0;
	devlist.writemostly = FlagDefault;
	devlist.failfast = FlagDefault;
	devlist.devname = devname;
	sprintf(devname, "%d:%d", major(devid), minor(devid));

	devlist.disposition = 'r';
	if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0, UOPT_UNDEFINED, 0) == 0) {
		devlist.disposition = 'a';
		if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0,
				   UOPT_UNDEFINED, 0) == 0) {
			/* make sure manager is aware of changes */
			ping_manager(to_devname);
			ping_manager(from_devname);
			close_fd(&fd1);
			close_fd(&fd2);
			return 1;
		}
		else
			Manage_subdevs(from_devname, fd2, &devlist,
				       -1, 0, UOPT_UNDEFINED, 0);
	}
	close_fd(&fd1);
	close_fd(&fd2);
	return 0;
}