#!/bin/bash

# lfs_migrate: a simple tool to copy and check files.
#
# To avoid allocating objects on one or more OSTs, they should be
# deactivated on the MDS via "lctl --device {device_number} deactivate",
# where {device_number} is from the output of "lctl dl" on the MDS.
#
# To guard against corruption, the file is compared after migration
# to verify the copy is correct and the file has not been modified.
# This does not protect against the file being open by another
# process; it only catches the worst cases of in-use files.  To be
# 100% safe, the administrator must ensure the files are not in use.

# Tool names and mode defaults.  RSYNC, LFS, TMPDIR and
# LFS_MIGRATE_RSYNC_MODE may be preset in the environment.
RSYNC=${RSYNC:-rsync}
OPT_RSYNC=${LFS_MIGRATE_RSYNC_MODE:-false}
ECHO=echo
LFS=${LFS:-lfs}
RSYNC_WITH_HLINKS=false
LFS_MIGRATE_TMP=${TMPDIR:-/tmp}
PROG=$(basename -- "$0")
# Scratch file holding one "<old FID> <path>" line per migrated hard link.
# Quote the template so a temp directory containing whitespace still works,
# and stop right away if the file cannot be created: an empty name would
# make later readers of the set fall back to standard input.
MIGRATED_SET=$(mktemp "${LFS_MIGRATE_TMP}/lfs_migrate.links.XXXXXX")
if [ -z "$MIGRATED_SET" ]; then
	echo "$PROG: cannot create temporary file in $LFS_MIGRATE_TMP" 1>&2
	exit 1
fi
# Name of the rsync-mode temporary copy currently in use, if any.
NEWNAME=""
# sed expression that strips the leading "[FID] " from a set entry.
REMOVE_FID='s/^\[[0-9a-fx:]*\] //'

# Record that the hard link at path $2, whose pre-migration FID was $1,
# has been handled: append "<fid> <path>" to the migrated-links set file.
add_to_set() {
	local fid="$1"
	local link_path="$2"

	printf '%s\n' "$fid $link_path" >> "$MIGRATED_SET"
}

# Succeed if path $1 is already recorded in the migrated-links set.
#
# The FID prefix of every entry is stripped first, then the remainder is
# compared as a fixed string against the whole line (-F -x).  File names
# routinely contain regex metacharacters such as '.', '[' or '*', so the
# path must not be interpreted as a pattern.
path_in_set() {
	local path="$1"

	sed -e "$REMOVE_FID" "$MIGRATED_SET" | grep -qxF -- "$path"
}

# Print the path of the first set entry recorded under FID $1, or nothing
# if that FID has not been recorded.  The FID is written as "[...]", so its
# opening bracket is escaped to keep grep from reading it as a character
# class.
old_fid_in_set() {
	local fid="$1"
	local entry_re="^\\$fid"

	grep -m 1 "$entry_re" "$MIGRATED_SET" | sed -e "$REMOVE_FID"
}

# Print the command synopsis and option summary on stderr, then terminate
# the script with status 1.  The help text is fixed, so the here-document
# delimiter is quoted to rule out any expansion.
usage() {
    cat >&2 <<'USAGE'
usage: lfs_migrate [--dry-run|-n] [--help|-h] [--no-rsync|--rsync] [--quiet|-q]
		   [--auto-stripe|-A [-C <cap>]
		   [--min-free|-M <min_free>] [--max-free|-X <max_free>]]
		   [--pool|-p <pool>] [--stripe-count|-c <stripe_count>]
		   [--stripe-size|-S <stripe_size>]
		   [-D] [-h] [-n] [-S]
		   [--restripe|-R] [--skip|-s] [--verbose|-v] [--yes|-y] [-0]
		   [FILE|DIR...]
	-A         restripe file using an automatically selected stripe count,
		   uses stripe_count = sqrt(size_in_GB) + 1
	-c <stripe_count>
		   restripe file using the specified <stripe_count>
	-C <cap>   when -A is set, limit the migrated file to use on each OST
		   at most 1/<cap> of the available space of the smallest OST
	-D         do not use direct I/O to copy file contents
	-h         show this usage message
	-M <min_free>
		   when -A is set, an OST must contain more available space than
		   <min_free> KB in order for it to be considered available for
		   use in the migration
	--no-rsync do not fall back to rsync mode even if lfs migrate fails
	-n         only print the names of files to be migrated
	-p <pool>  use the specified OST pool for the destination file
	-q         run quietly (don't print filenames or status)
	--rsync    force rsync mode instead of using lfs migrate
	-R         restripe file using default directory striping
	-s         skip file data comparison after migrate
	-S <stripe_size>
		   restripe file using the specified stripe size
	-v         show verbose debug messages
	-X <max_free>
		   when -A is set, limit the amount of space on each OST that
		   can be considered available for the migration to
		   <max_free> KB
	-y         answer 'y' to usage question
	-0         input file names on stdin are separated by a null character

Options '-A', '-c', and '-R' are mutually exclusive.
Options '-C', '-M', and '-X' are ignored if '-A' is not set.

The --rsync and --no-rsync options may not be specified at the same time.

If a directory is an argument, all files in the directory are migrated.
If no file/directory is given, the file list is read from standard input.

Any arguments that are not explicitly recognized by the script are passed
through to the 'lfs migrate' utility.

Examples:
      lfs_migrate /mnt/lustre/dir
      lfs_migrate -p newpool /mnt/lustre/dir
      lfs find /test -O test-OST0004 -size +4G | lfs_migrate -y
USAGE
    exit 1
}

# Exit handler: remove the migrated-links set file and, when one has been
# named, the rsync-mode temporary copy.
cleanup() {
	rm -f "$MIGRATED_SET"
	if [ -z "$NEWNAME" ]; then
		# no temporary copy was named; report the same non-zero
		# status a failed non-empty test on NEWNAME would give
		return 1
	fi
	rm -f "$NEWNAME"
}

# run the handler on every way out of the script
trap 'cleanup' EXIT

# Option state, filled in by the command-line parser below.

# on/off switches
OPT_AUTOSTRIPE=false
OPT_CHECK=true
OPT_COMP=false
OPT_DEBUG=false
OPT_DRYRUN=false
OPT_NO_DIRECT=false
OPT_NO_RSYNC=false
OPT_NULL=false
OPT_RESTRIPE=false
OPT_YES=false

# collected arguments
declare -a OPT_FILE=()
declare -a OPT_LAYOUT=()
declare -a OPT_PASSTHROUGH=()

# layout selection
OPT_POOL=""
OPT_STRIPE_COUNT=""
OPT_STRIPE_SIZE=""

# auto-stripe tuning
OPT_CAP=100
OPT_MAXFREE=""
OPT_MINFREE=262144

# Examine any long options and arguments.  getopts does not support long
# options, so they must be stripped out and classified as either options
# for the script, or passed through to "lfs migrate".
while [ -n "$*" ]; do
	arg="$1"
	case "$arg" in
	-h|--help) usage;;
	-l|--link) ;; # maintained backward compatibility for now
	-n) OPT_DRYRUN=true; OPT_YES=true
	   echo "$PROG: -n deprecated, use --dry-run or --non-block" 1>&2;;
	--dry-run) OPT_DRYRUN=true; OPT_YES=true;;
	-p|--pool) OPT_POOL="$arg $2"; OPT_LAYOUT+="$OPT_POOL "; shift;;
	-q|--quiet) ECHO=:;;
	-R|--restripe) OPT_RESTRIPE=true;;
	-s|--skip) OPT_CHECK=false;;
	-v|--verbose) OPT_DEBUG=true; ECHO=echo;;
	-y|--yes) OPT_YES=true;;
	-0) OPT_NULL=true;;
	-b|--block|--non-block|--non-direct|-D|--no-verify)
	   # Always pass non-layout options to 'lfs migrate'
	   OPT_PASSTHROUGH+=("$arg");;
	--rsync) OPT_RSYNC=true;;
	--no-rsync) OPT_NO_RSYNC=true;;
	--copy|--yaml|--file) OPT_COMP=true;
	   # these options have files as arguments, pass both through
	   OPT_LAYOUT+="$arg $2 "; shift;;
	--auto-stripe|-A) OPT_AUTOSTRIPE=true;;
	-C) OPT_CAP="$2"; shift;;
	-M|--min-free) OPT_MINFREE="$2"; shift;;
	-X|--max-free) OPT_MAXFREE="$2"; shift;;
	-c|--stripe-count) OPT_STRIPE_COUNT="$2"; shift;;
	-S|--stripe-size) OPT_STRIPE_SIZE="$2"; shift;;
	*) # Pass other non-file layout options to 'lfs migrate'
	   [ -e "$arg" ] && OPT_FILE+="$arg " && break || OPT_LAYOUT+="$arg "
	esac
	shift
done

# Reject option combinations that contradict each other.  Every check
# terminates the script, so they are written as independent tests in the
# order in which they must be reported.

# -R and -A choose the layout themselves; explicit layout options clash
if { $OPT_RESTRIPE || $OPT_AUTOSTRIPE; } && [[ -n "$OPT_LAYOUT" ]]; then
	echo "$PROG error: Options '$OPT_LAYOUT' can't be used with -R or -A" >&2
	exit 1
fi

if $OPT_RESTRIPE && [[ -n "$OPT_STRIPE_COUNT" || -n "$OPT_STRIPE_SIZE" ]]; then
	echo "$PROG error: Option -R can't be used with -c or -S" >&2
	exit 1
fi

if $OPT_AUTOSTRIPE && [[ -n "$OPT_STRIPE_COUNT" ]]; then
	echo "$PROG error: Option -A can't be used with -c" >&2
	exit 1
fi

if $OPT_AUTOSTRIPE && $OPT_RESTRIPE; then
	echo "$PROG error: Option -A can't be used with -R" >&2
	exit 1
fi

if $OPT_RSYNC && $OPT_NO_RSYNC; then
	echo "$PROG: Options --rsync and --no-rsync may not be specified at the same time." >&2
	exit 1
fi

# Unless confirmation was waived (OPT_YES), print the safety warning on
# stderr and ask before continuing.  Only "y" or "yes" proceeds; any other
# answer ends the script with status 1.
if ! $OPT_YES; then
	{
		echo ""
		echo "lfs_migrate is currently NOT SAFE for moving in-use files."
		echo "Use it only when you are sure migrated files are unused."
		echo ""
		echo "If emptying an OST that is active on the MDS, new files may"
		echo "use it.  To stop allocating any new objects on OSTNNNN run:"
		echo "  lctl set_param osp.<fsname>-OSTNNNN*.max_create_count=0"
		echo "on each MDS using the OST(s) being emptied."
	} 1>&2
	echo -n "Continue? (y/n) "
	# -r: take the answer literally, without backslash processing
	read -r CHECK
	[[ "$CHECK" == "y" || "$CHECK" == "yes" ]] || exit 1
fi

# if rsync has --xattr support, then try to copy the xattrs.
# The help text is fetched once and searched for both capabilities.
rsync_help=$($RSYNC --help 2>&1)
grep -q xattr <<< "$rsync_help" && RSYNC_OPTS="$RSYNC_OPTS -X"
grep -q acls <<< "$rsync_help" && RSYNC_OPTS="$RSYNC_OPTS -A"
# If rsync copies lustre xattrs in the future, then we can skip lfs (bug 22189)
# Only inspect the binary when it can actually be located: "strings" given
# no file name reads standard input, which may be carrying the file list.
rsync_bin=$(command -v $RSYNC 2> /dev/null | head -n 1)
if [ -n "$rsync_bin" ] &&
   strings "$rsync_bin" < /dev/null 2>&1 | grep -q lustre; then
	LFS=:
fi
unset rsync_help rsync_bin

# rsync creates its temporary files with lenient permissions, even if
# permissions on the original files are more strict. Tighten umask here
# to avoid the brief window where unprivileged users might be able to
# access the temporary file.
umask 0077

# Use stripe count = sqrt(size_in_GB) + 1, but cap object size per OST.
#
# Arguments:
#   $1  file to be migrated (used to select the filesystem for "lfs df")
#   $2  file size in KB
#   $3  per-OST object size cap in KB from a previous call, or empty to
#       have it computed here
# Globals read: LFS, OPT_POOL, OPT_MINFREE, OPT_MAXFREE, OPT_CAP
# Output: "<stripe_count> <obj_max_kb>" on stdout; nothing at all if the
#   stripe count cannot be calculated.  A stripe count of -1 is printed
#   when no OST has at least OPT_MINFREE KB available, and an obj_max_kb
#   of 0 means no cap could be determined.
function calc_stripe()
{
	local filename=$1
	local filekb=$2
	local obj_max_kb=$3
	local filegb=$((filekb / 1048576))
	local stripe_count=1
	local ost_max_count=0
	local avail

	# Files up to 1GB will have 1 stripe if they fit within the object max
	if [[ $filegb -lt 1 && "$obj_max_kb" && $filekb -le $obj_max_kb ]]; then
		echo 1 "$obj_max_kb" && return
	fi

	stripe_count=$(bc <<< "scale=0; 1 + sqrt($filegb)" 2> /dev/null) ||
		{ echo "cannot auto calculate stripe count" >&2; return; }

	if [ -z "$obj_max_kb" ]; then
		local ost_min_kb=$((1 << 62))

		# Calculate cap on object size at 1/OPT_CAP of the smallest
		# OST, but only include OSTs that have at least OPT_MINFREE KB
		# of available space.  Lines whose 4th column is not a number
		# are ignored.
		while IFS='' read -r avail; do
			[[ "$OPT_MAXFREE" && $avail -gt $OPT_MAXFREE ]] &&
				avail=$OPT_MAXFREE
			if [ $avail -ge $OPT_MINFREE ]; then
				ost_max_count=$((ost_max_count + 1))
				if [ $avail -lt $ost_min_kb ]; then
					ost_min_kb=$avail
				fi
			fi
		done < <($LFS df $OPT_POOL "$filename" |
			 awk '/OST/ && $4 ~ /^[0-9]+$/ { print $4 }')

		if [ $ost_max_count -eq 0 ]; then
			# no OSTs with enough space, stripe over all of them
			echo "-1" "0"
			return
		fi

		if (( ost_min_kb == (1 << 62) )); then
			echo "warning: unable to determine minimum OST size, " \
			     "object size not capped" >&2
			echo "$stripe_count" "0"
			return
		fi

		obj_max_kb=$((ost_min_kb / $OPT_CAP))
	elif [ $obj_max_kb -eq 0 ]; then
		echo "warning: unable to determine minimum OST size " \
		     "from previous migrate, object size not capped" >&2
		echo "$stripe_count" "$obj_max_kb"
		return
	fi

	# If disk usage would exceed the cap, increase the number of stripes.
	# Round up to the nearest MB to ensure file will fit.
	(( filekb > stripe_count * obj_max_kb )) &&
		stripe_count=$(((filekb + obj_max_kb - 1024) / obj_max_kb))

	# Limit the count to the number of eligible OSTs.  The OSTs are only
	# counted while the cap is being computed above; when the cap was
	# handed in by the caller the count is unknown (still 0) and must
	# not be used as a limit, or every such file would get 0 stripes.
	if [ "$ost_max_count" -gt 0 ] &&
	   [ "$stripe_count" -gt "$ost_max_count" ]; then
		echo "$ost_max_count" "$obj_max_kb"
	else
		echo "$stripe_count" "$obj_max_kb"
	fi
}

# Migrate every file whose NUL-terminated path name is read from stdin.
#
# Each file is first handed to "$LFS migrate" (unless rsync mode is
# active).  If that fails and falling back is allowed, the file is copied
# with rsync into a temporary file in the same directory, optionally
# compared with the original, and renamed over it; other hard links found
# via "$LFS fid2path" are then re-created against the new copy.
#
# Globals read:  ECHO, LFS, RSYNC, RSYNC_OPTS, OPT_* option state
# Globals set:   OLDNAME, NEWNAME, UNLINK, OBJ_MAX_KB, OPT_RSYNC (switched
#                to true on the first fallback), RSYNC_WITH_HLINKS
# Exit codes:    1 stripe count could not be calculated, 2 setstripe
#                failed, 4 copy failed, 8 compare failed, 12 rename failed
lfs_migrate() {
	local last_dev=""
	local mntpoint=""
	local link

	# -r keeps backslashes in file names intact
	while IFS='' read -r -d '' OLDNAME; do
		local hlinks=()
		local hlinks_known=true
		local hlink_paths=""
		local have_self=false
		local layout=()
		local fid=""
		# Per-file copy of OPT_COMP: a composite layout found on one
		# file must not change how the following files are handled.
		local use_comp=$OPT_COMP

		$ECHO -n "$OLDNAME: "

		# avoid duplicate stat call by fetching all attrs at once.
		# %F comes last because it expands to a varying number of
		# words ("regular file", "regular empty file", ...); only
		# its first word is examined.
		local nlink_idx_link=0 # %h is the hard link count
		local nlink_idx_size=1 # %s is file size in bytes
		local nlink_idx_dev=2  # %D is the underlying device number
		local nlink_idx_type=3 # %F is "regular [empty] file", ...
		# nlink_type=(1 1234 810 regular file)
		local nlink_type=($(LANG=C stat -c "%h %s %D %F" "$OLDNAME" \
				 2> /dev/null))

		# skip non-regular files, since they don't have any objects
		# and there is no point in trying to migrate them.
		if [ "${nlink_type[$nlink_idx_type]}" != "regular" ]; then
			echo -e "\r$OLDNAME: not a regular file, skipped" 1>&2
			continue
		fi

		# working out write perms is hard, let the shell do it
		if [ ! -w "$OLDNAME" ]; then
			echo -e "\r$OLDNAME: no write permission, skipped" 1>&2
			continue
		fi

		if $OPT_DRYRUN && ! $OPT_DEBUG; then
			$ECHO "dry run, skipped"
			continue
		fi

		# xattrs use absolute file paths, so ensure provided path is
		# also absolute so that the names can be compared
		local oldname_absolute=$(readlink -f "$OLDNAME")
		if [ -z "$oldname_absolute" ]; then
			echo -e "\r$OLDNAME: cannot resolve full path, skipped" 1>&2
			continue
		fi
		OLDNAME=$oldname_absolute

		if [[ ${nlink_type[$nlink_idx_link]} -gt 1 ]] ||
		   $RSYNC_WITH_HLINKS; then
			fid=$($LFS path2fid "$OLDNAME" 2> /dev/null)
			if [ $? -ne 0 ]; then
				echo -e "\r$OLDNAME: cannot get FID, skipping; is this a Lustre file system?" 1>&2
				continue
			fi

			# don't migrate a hard link if it was already migrated
			if path_in_set "$OLDNAME"; then
				$ECHO "already migrated via another hard link"
				continue
			fi

			# There is limited space available in the xattrs
			# to store all of the hard links for a file, so it's
			# possible that $OLDNAME is part of a link set but is
			# not listed in xattrs and therefore not listed as
			# being migrated.
			local migrated=$(old_fid_in_set "$fid")
			if [ -n "$migrated" ]; then
				$ECHO "already migrated via another hard link"
				# Only the rsync case has to relink.  The
				# "lfs migrate" case keeps the same inode so
				# all of the links are already correct.
				$OPT_RSYNC && [ "$migrated" != "$OLDNAME" ] &&
					ln -f "$migrated" "$OLDNAME"

				add_to_set "$fid" "$OLDNAME"
				continue
			fi
		fi

		local olddir=$(dirname "$OLDNAME")
		local stripe_size="$OPT_STRIPE_SIZE"
		local stripe_count="$OPT_STRIPE_COUNT"
		local stripe_opts="-N --comp-count -c -S -p -y"
		local parent_count=""
		local parent_size=""
		local parent_layout
		# OPT_POOL is "<option> <pool>"; drop the option word,
		# whichever spelling (-p or --pool) it carries
		local stripe_pool="${OPT_POOL#* }"
		local mirror_count=1
		local comp_count=0
		# avoid multiple getstripe calls
		#   lcm_mirror_count:  1
		#   lcm_entry_count:   0
		#      lmm_stripe_count:  1
		#      lmm_stripe_size:   1048576
		#      lmm_pool:          pool_abc
		local l_mirror_count=0
		local l_comp_count=1
		local l_stripe_count=2
		local l_stripe_size=3
		local l_stripe_pool=4
		local layout_info=($($LFS getstripe $stripe_opts "$OLDNAME" \
			2>/dev/null | awk '{ print $2 }'))

		# The layout options are collected as an array so that path
		# arguments (--copy) survive intact even with whitespace.
		layout=("${OPT_PASSTHROUGH[@]}")

		if $OPT_RESTRIPE; then
			UNLINK=""
			layout+=(--copy "$olddir")
			use_comp=true
		else
			# If rsync copies Lustre xattrs properly in the future
			# (i.e. before the file data, so that it preserves
			# striping) then we don't need this getstripe stuff.
			UNLINK="-u"

			[ -n "$OPT_POOL" ] ||
				stripe_pool=${layout_info[$l_stripe_pool]}
			mirror_count=${layout_info[$l_mirror_count]}

			if $OPT_AUTOSTRIPE; then
				local filekb=$((${nlink_type[$nlink_idx_size]} /
						1024))

				read stripe_count OBJ_MAX_KB < <(calc_stripe \
					"$OLDNAME" "$filekb" "$OBJ_MAX_KB")
				[ -z "$stripe_count" ] && exit 1
				# calc_stripe prints -1 to request striping
				# over all OSTs, so keep that value; any
				# other non-positive count falls back to 1.
				[[ $stripe_count -eq 0 || $stripe_count -lt -1 ]] &&
					stripe_count=1
			else
				[ -n "$stripe_count" ] ||
					stripe_count=${layout_info[$l_stripe_count]}
			fi
			[ -n "$stripe_size" ] ||
				stripe_size=${layout_info[$l_stripe_size]}

			[ -z "$stripe_count" -o -z "$stripe_size" ] && UNLINK=""
		fi

		if $OPT_DEBUG; then
			if $OPT_RESTRIPE; then
				parent_layout=($($LFS getstripe $stripe_opts \
						-d "$olddir" 2>/dev/null |
						awk '{print $2 }'))
				parent_count=${parent_layout[$l_stripe_count]}
				parent_size=${parent_layout[$l_stripe_size]}
				stripe_pool=${parent_layout[$l_stripe_pool]}
				mirror_count=${parent_layout[$l_mirror_count]}
			fi

			$ECHO -n "stripe_count=${stripe_count:-$parent_count},stripe_size=${stripe_size:-$parent_size}"
			[ -n "$stripe_pool" ] &&
				$ECHO -n ",pool=${stripe_pool}"
			[[ $mirror_count -gt 1 ]] &&
				$ECHO -n ",mirror_count=${mirror_count}"
			$ECHO -n " "
		fi

		if $OPT_DRYRUN; then
			$ECHO " dry run, skipped"
			continue
		fi

		# a file that already has a composite layout keeps it
		if ! $use_comp &&
		   [[ ${layout_info[$l_comp_count]:-0} -gt 0 ]]; then
			layout+=(--copy "$OLDNAME")
			use_comp=true
		fi
		if ! $use_comp; then
			[ -n "$stripe_count" ] && layout+=(-c "$stripe_count")
			[ -n "$stripe_size" ] && layout+=(-S "$stripe_size")
			[ -n "$OPT_POOL" -a -n "$stripe_pool" ] &&
						layout+=(-p "$stripe_pool")
			[[ $mirror_count -gt 1 ]] && layout+=(-N "$mirror_count")
		fi
		# OPT_LAYOUT is a space-separated option string and is split
		# into words on purpose
		layout+=($OPT_LAYOUT)

		# detect other hard links and store them on a global
		# list so we don't re-migrate them
		if [[ ${nlink_type[$nlink_idx_link]} -gt 1 ]]; then
			# look up the mount point only when the device changes
			if [ "${nlink_type[$nlink_idx_dev]}" != "$last_dev" ]; then
				mntpoint=$(df -P "$OLDNAME" |
					   awk 'NR==2 { print $NF }')
				last_dev=""
				[ -n "$mntpoint" ] &&
					last_dev=${nlink_type[$nlink_idx_dev]}
			fi
			if [ -z "$mntpoint" ]; then
				echo -e "\r$OLDNAME: cannot determine mount point; skipped" 1>&2
				continue
			fi
			# remember the lookup result before any other command
			# overwrites $?
			hlink_paths=$($LFS fid2path "$mntpoint" "$fid" 2> /dev/null)
			[ $? -eq 0 ] || hlinks_known=false
			if $OPT_RSYNC && ! $hlinks_known; then
				echo -e "\r$OLDNAME: cannot determine hard link paths, skipped" 1>&2
				continue
			fi
			# one path per line; keep embedded whitespace intact
			while IFS='' read -r link; do
				[ -n "$link" ] && hlinks+=("$link")
			done <<< "$hlink_paths"
			for link in "${hlinks[@]}"; do
				[ "$link" == "$OLDNAME" ] && have_self=true
			done
			$have_self || hlinks+=("$OLDNAME")
		fi

		# first try to migrate via Lustre tools, then fall back to rsync
		if ! $OPT_RSYNC; then
			if $OPT_DEBUG; then
				echo -e "\n$LFS migrate ${layout[*]} \"$OLDNAME\""
			fi
			if $LFS migrate "${layout[@]}" "$OLDNAME"; then
				$ECHO "done"
				# no-op if hlinks empty for 1-link files
				for link in "${hlinks[@]}"; do
					add_to_set "$fid" "$link"
				done
				continue
			elif $OPT_NO_RSYNC; then
				echo -e "\r$OLDNAME: refusing to fall back to rsync, skipped" 1>&2
				continue
			elif ! $hlinks_known; then
				# rsync creates a new inode, so every other
				# link would have to be re-created as well
				echo -e "\r$OLDNAME: cannot determine hard link paths, skipped" 1>&2
				continue
			else
				$ECHO -n "falling back to rsync: "
				OPT_RSYNC=true
			fi
		fi

		local oldfile=$(basename "$OLDNAME")
		NEWNAME=$(mktemp $UNLINK "$olddir/.$oldfile.XXXXXX")
		if [ $? -ne 0 -o -z "$NEWNAME" ]; then
			echo -e "\r$OLDNAME: cannot make temp file, skipped" 1>&2
			continue
		fi

		if [ "$UNLINK" ]; then
			if ! $LFS setstripe "${layout[@]}" "$NEWNAME"; then
				echo -e "\r$NEWNAME: setstripe failed, exiting" 1>&2
				exit 2
			fi
		fi

		# we use --inplace, since we created our own temp file already
		if ! $RSYNC -a --inplace $RSYNC_OPTS "$OLDNAME" "$NEWNAME";then
			echo -e "\r$OLDNAME: copy error, exiting" 1>&2
			exit 4
		fi

		if $OPT_CHECK && ! cmp -s "$OLDNAME" "$NEWNAME"; then
			echo -e "\r$NEWNAME: compare failed, exiting" 1>&2
			exit 8
		fi

		if ! mv "$NEWNAME" "$OLDNAME"; then
			echo -e "\r$OLDNAME: rename error, exiting" 1>&2
			exit 12
		fi
		# the temporary name no longer exists
		NEWNAME=""

		$ECHO "done rsync"
		# no-op if hlinks empty for 1-link files
		for link in "${hlinks[@]}"; do
			if [ "$link" != "$OLDNAME" ]; then
				ln -f "$OLDNAME" "$link"
			fi
			add_to_set "$fid" "$link"
		done

		# If the number of hlinks exceeds the space in the xattrs,
		# when the final path is statted it will have a link count
		# of 1 (all other links will point to the new inode).
		# This flag indicates that even paths with a link count of
		# 1 are potentially part of a link set.
		if [[ ${nlink_type[$nlink_idx_link]} -gt 1 ]]; then
			RSYNC_WITH_HLINKS=true
		fi
	done
}

# Feed lfs_migrate a NUL-separated list of file names.  With no file
# arguments left after option parsing the list comes from stdin (already
# NUL-separated when OPT_NULL is set, newline-separated otherwise).
# Otherwise each argument is expanded: directories via "$LFS find",
# anything else is passed on verbatim.
if [ "$#" -eq 0 ]; then
	if $OPT_NULL; then
		lfs_migrate
	else
		tr '\n' '\0' | lfs_migrate
	fi
else
	# Walk every argument; an empty one must not cut the list short.
	while [ "$#" -gt 0 ]; do
		if [ -d "$1" ]; then
			$LFS find "$1" -type f -print0
		else
			# printf emits the name byte-for-byte, so backslash
			# sequences in file names are not interpreted
			printf '%s\0' "$1"
		fi
		shift
	done | lfs_migrate
fi

