#!/bin/bash

# backup-statistics version 2.2

# Stop at the first command that fails.
set -o errexit

# Load the site configuration when it is readable.
# NOTE(review): cacheDir, backups and maxWait are used below but never
# assigned in this script; presumably this file provides them - confirm.
if [ -r "/etc/backup.conf" ]
then
  . "/etc/backup.conf"
fi

# do_stage STAGE BACKUP_ID
# do_stage STAGE '##DESCRIBE##'
#
# Runs one step (1-7) of the deduplication pipeline, or - when the second
# argument is the literal string '##DESCRIBE##' - only prints a one-line
# description of that step and returns 0.
#
# Note that the backup to process is taken from the global ${backupID}; the
# second argument is only inspected for the '##DESCRIBE##' marker.
#
# Globals read:    backupID, backups (associative array; the destination is
#                  the value up to the first space), cacheDir, paranoid, dummy
# Globals written: maxWait (remaining seconds to wait for a destination)
# Files:           everything is kept in ${cacheDir}/${backupID}.* ;
#                  ${cacheDir}/next.action records the progress of stage 7
# Any other STAGE value does nothing.
do_stage()
{
  local dest entry tmpDirA tmpDirB index startBlock
  local block lastBlock firstInode lastInode

  case $1 in
    1)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $filename -> $inode'
        return 0
      fi
      dest="${backups["${backupID}"]%% *}"
      dest="${dest%/}"
      # Give the destination up to maxWait seconds (shared between all
      # backups) to show up; an unset maxWait means "do not wait".
      while [ ! -d "${dest}" ] && [ "${maxWait:-0}" -gt 0 ]
      do
        sleep 1
        maxWait=$((maxWait - 1))
      done

      rm -f "${cacheDir}/${backupID}.inodes"
      touch "${cacheDir}/${backupID}.inodes"
      chmod go-rwx "${cacheDir}/${backupID}.inodes"
      # Walk the top-level entries with a glob instead of parsing ls, so that
      # names containing whitespace stay in one piece.
      for entry in "${dest}"/*
      do
        # An unmatched glob stays literal - skip it. Dangling symlinks are
        # kept, because a plain directory listing would contain them as well.
        [ -e "${entry}" ] || [ -L "${entry}" ] || continue
        echo "${entry##*/}:"
        # One line per regular file: "<inode> <dev>-<mode>-<uid>-<gid> <path>".
        # Files that already have 64001 or more links are left out.
        find "${entry}" -type f -links -64001 -printf '%i %D-%m-%U-%G %p\n' >> \
          "${cacheDir}/${backupID}.inodes"
      done
    ;;
    2)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort previous lists by $inode'
        return 0
      fi
      # Two scratch directories (default temp location and cache directory)
      # are offered to sort for its temporary files.
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      touch "${cacheDir}/${backupID}.inodes.sorted"
      chmod go-rwx "${cacheDir}/${backupID}.inodes.sorted"
      sort -T "${tmpDirA}" -T "${tmpDirB}" -u "${cacheDir}/${backupID}.inodes" > \
        "${cacheDir}/${backupID}.inodes.sorted"
      rmdir "${tmpDirA}" "${tmpDirB}"
    ;;
    3)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate lists $inode -> $count, $contentHash'
        return 0
      fi
      touch "${cacheDir}/${backupID}.content"
      chmod go-rwx "${cacheDir}/${backupID}.content"
      # For every line "<count> <inode> <dev-mode-uid-gid> <path>" coming out
      # of uniq, parallel runs "sha512sum <path> | sed ..." and the sed call
      # turns the checksum line into
      #   "<sha512>-<dev-mode-uid-gid> <count> <inode>".
      # The backslashes in the {= =} parts are consumed by this shell, so
      # parallel receives ordinary regular expressions with plain ( ) + {2}.
      # NOTE(review): -m is not an option of stock GNU uniq; presumably a
      # patched uniq that compares only the first N fields - confirm.
      uniq -cm2 "${cacheDir}/${backupID}.inodes.sorted" | \
        parallel \
          sha512sum {=s/^ *\([[:digit:]]\+ \)\{2\}[0-9-]\+ //=} \| \
            sed '"s|^\([0-9a-f]\{128\}\)  .*\$|\1'{=s/^ *\([[:digit:]]\+ [[:digit:]]\+\) \([0-9-]\+\) .*/-\\2 \\1/=}'|"' \
          \; > \
        "${cacheDir}/${backupID}.content"
    ;;
    4)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'sort previous lists by $contentHash'
        return 0
      fi
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      touch "${cacheDir}/${backupID}.content.sorted"
      chmod go-rwx "${cacheDir}/${backupID}.content.sorted"
      # Primary key: first field; secondary key: second field, numerically
      # descending.
      sort -T "${tmpDirA}" -T "${tmpDirB}" -k1,1 -k2nr,2 "${cacheDir}/${backupID}.content" > \
        "${cacheDir}/${backupID}.content.sorted"
      rmdir "${tmpDirA}" "${tmpDirB}"
    ;;
    5)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'generate sorted lists of groups of inodes with the same hashes'
        return 0
      fi
      index=0
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"
      touch "${cacheDir}/${backupID}.duplicates"
      chmod go-rwx "${cacheDir}/${backupID}.duplicates"
      # uniq keeps only repeated lines and separates the groups by an empty
      # line; sed drops the first two fields. The loop numbers the groups
      # (an empty line starts the next one) and emits "<rest> B <group>".
      uniq -m1 --all-repeated=separate "${cacheDir}/${backupID}.content.sorted" | \
        sed 's|^\(\S\+ \)\{2\}||' | \
        while read -r s
        do
          if [ -z "${s}" ]
          then
            index=$((index + 1))
          else
            echo "${s#* } B ${index}"
          fi
        done | \
        sort -T "${tmpDirA}" -T "${tmpDirB}" > \
        "${cacheDir}/${backupID}.duplicates"
      rmdir "${tmpDirA}" "${tmpDirB}"
    ;;
    6)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'find files to inodes of previous lists'
        return 0
      fi
      tmpDirA="$(mktemp -d)"
      tmpDirB="$(mktemp -d "${cacheDir}/tmp.XXXXXX")"

      # Start the two loops below from a clean state.
      unset block
      unset lastBlock
      unset firstInode
      unset lastInode

      touch "${cacheDir}/${backupID}.duplicates.files"
      chmod go-rwx "${cacheDir}/${backupID}.duplicates.files"
      # Pipeline overview:
      #  1. sed rewrites the inode list to "<inode> F <path>" and sort -m
      #     merges it with the "<inode> B <group>" lines of the previous stage.
      #  2. The first loop remembers the group of a B line and prints
      #     "<group> <inode> <path>" for following lines with the same inode.
      #  3. After sorting by group number, the second loop prints the first
      #     line of every group plus all lines whose inode differs from that
      #     first inode, as "<group> <path>".
      #  4. uniq separates the groups by empty lines.
      sed '
        s|^\(\S\+\) \S\+ |\1 F |
      ' "${cacheDir}/${backupID}.inodes.sorted" | \
        sort -m -T "${tmpDirA}" -T "${tmpDirB}" -- \
          - "${cacheDir}/${backupID}.duplicates" | \
        while read -r inode type extra
        do
          if [ "${type}" == "B" ]
          then
            block="${extra}"
          elif [ "${lastInode}" == "${inode}" ] && [ -n "${block}" ]
          then
            echo "${block} ${inode} ${extra}"
          else
            unset block
          fi
          lastInode="${inode}"
        done | \
        sort -T "${tmpDirA}" -T "${tmpDirB}" -k1n,1 | \
        while read -r block inode extra
        do
          if [ "${lastBlock}" != "${block}" ]
          then
            firstInode="${inode}"
          fi
          if [ "${lastBlock}" != "${block}" ] || [ "${firstInode}" != "${inode}" ]
          then
            echo "${block} ${extra}"
          fi
          lastBlock="${block}"
        done | \
        uniq -m1 --group=separate > \
        "${cacheDir}/${backupID}.duplicates.files"
      rmdir "${tmpDirA}" "${tmpDirB}"
    ;;
    7)
      if [ "$2" == '##DESCRIBE##' ]
      then
        echo 'relink files with different inodes and same hashes'
        return 0
      fi
      # Input selection:
      #  - no next.action file: process the complete list;
      #  - next.action names this backup: resume at the recorded group (sed
      #    deletes everything before the first line of that group);
      #  - next.action names another backup: nothing is fed into the loop.
      if [ ! -r "${cacheDir}/next.action" ]
      then
        cat "${cacheDir}/${backupID}.duplicates.files"
      elif [ "$(head -n1 "${cacheDir}/next.action")" == "${backupID}" ]
      then
        startBlock="$(tail -n1 "${cacheDir}/next.action")"
        sed "
          :vor;
            /^${startBlock} /bnach;
            d;
            bvor;
          :nach;
            n;
            bnach
        " "${cacheDir}/${backupID}.duplicates.files"
      fi | \
        while read -r oBlock original
        do
          # The first line of a group is the reference file. Record the
          # position (written to a second file and renamed into place) so an
          # interrupted run can be resumed.
          echo "${backupID}" > "${cacheDir}/next.action2"
          echo "${oBlock}" >> "${cacheDir}/next.action2"
          mv "${cacheDir}/next.action2" "${cacheDir}/next.action"
          # The remaining lines up to the empty separator are the copies.
          while read -r kBlock kopie
          do
            [ -z "${kopie}" ] && break
            if [ "${kBlock}" != "${oBlock}" ]
            then
              >&2 echo "'${kBlock}' != '${oBlock}'"
              >&2 echo "'${backupID}':"
              >&2 echo "'${original}'"
              >&2 echo "'${kopie}'"
              exit 1
            fi

            if ${paranoid}
            then
              diff "${original}" "${kopie}"
            fi
            # When the reference already has 65000 or more links, move it over
            # to the inode of the copy instead of adding yet another link.
            if [ "$(stat -c'%h' "${original}")" -ge 65000 ]
            then
              echo "rm \"${original}\""
              echo "ln \"${kopie}\" \"${original}\""
              if ! ${dummy}
              then
                rm "${original}"
                ln "${kopie}" "${original}"
              fi
            else
              echo "rm \"${kopie}\""
              echo "ln \"${original}\" \"${kopie}\""
              if ! ${dummy}
              then
                rm "${kopie}"
                ln "${original}" "${kopie}"
              fi
            fi
          done
        done
      # This backup is finished: drop its progress marker.
      if [ -r "${cacheDir}/next.action" ] && \
        [ "$(head -n1 "${cacheDir}/next.action")" == "${backupID}" ]
      then
        rm -f "${cacheDir}/next.action" "${cacheDir}/next.action2"
      fi
    ;;
  esac
}

# usage [STATUS]
#
# Writes the help text, followed by a numbered description of every stage
# (obtained from do_stage), to stderr and then terminates the script.
#   STATUS  exit status to use; 1 when it is missing or empty
usage()
{
  cat >&2 <<'EOF'
Usage:  backup-statistics [OPTION]
Search and tidy duplicate and not-hardlinked files in the backups.

With no options, tidy up all backups. THIS CAN BE VERY TIME CONSUMING.

Mandatory arguments to long options are mandatory for short options too.
  -d, --dummy         only generate lists, do not modify backupfiles
  -m, --max=maxNum    stop execution after step maxNum
  -p, --paranoid      test for file differences before relinking (test _should_ be obsolete)
  -s, --skip=skipNum  skip first skipNum steps
  --help              display this help and exit
  --version           display version and exit

the executed steps are:
EOF

  # Each stage is listed as an empty line followed by "  N. description".
  stage=1
  while [ "${stage}" -le 7 ]
  do
    printf '\n  %s. %s\n' "${stage}" "$(do_stage "${stage}" '##DESCRIBE##')" >&2
    stage=$((stage + 1))
  done
  printf '\n' >&2
  exit "${1:-1}"
}

# Let getopt(1) normalize the command line and install its output as the new
# positional parameters. When getopt reports an error, the word "usage" is
# appended to the text handed to eval; presumably getopt terminates its own
# output with a newline, so that word runs as a separate command and calls
# usage() - confirm against the installed getopt.
eval set -- "$(
  getopt -o dm:ps: \
    --long dummy \
    --long help \
    --long max: \
    --long paranoid \
    --long skip: \
    --long version \
    -n "$(basename "$0")" -- "$@" || \
  echo usage
)"

# Defaults; the options handled below override them.
dummy=false
maxNum=7
paranoid=false
skipNum=0

while true; do
  case "$1" in
    -d|--dummy)
      dummy=true
      ;;
    --help)
      usage 0
      ;;
    -m|--max)
      shift
      maxNum=$1
      ;;
    -p|--paranoid)
      paranoid=true
      ;;
    -s|--skip)
      shift
      skipNum=$1
      ;;
    --version)
      >&2 echo '2.2'
      exit 0
      ;;
    --)
      shift
      # Non-option arguments are not accepted. Name the offending word itself
      # (not the number of leftover words) and report it on stderr like every
      # other error of this script.
      if [ $# -gt 0 ]
      then
        >&2 echo 'ERROR: Unknown parameter: '"$1"
        usage
      fi
      break
      ;;
    *)
      >&2 echo 'That should not happen, '"$1"' unknown though ...'
      # 255 is the status bash reports for the former "exit -1".
      exit 255
      ;;
  esac
  shift
done

# The cache directory has to be configured and has to exist already; report
# the problem on stderr and stop with status 1 otherwise.
if [ -z "${cacheDir}" ] || [ ! -d "${cacheDir}" ]
then
  >&2 echo "ERROR: Cache directory must exist, '${cacheDir}' does not! Closing."
  exit 1
fi

# Tag the directory as a cache directory: the tag file starts with
# "Signature: " followed by the MD5 sum of the string '.IsCacheDirectory'.
(
  echo -n 'Signature: '
  echo -n '.IsCacheDirectory' | \
    md5sum - | \
    cut -d ' ' -f 1
  echo '# This file is a cache directory tag created by '"$(basename "$0")"'.'
  echo '# For information about cache directory tags, see:'
  echo '#	http://www.brynosaurus.com/cachedir/'
) > "${cacheDir}/CACHEDIR.TAG"
# rsync filter rules for this directory: include the filter file itself,
# exclude everything else.
(
  echo '+ .rsync-filter'
  echo '- *'
) > "${cacheDir}/.rsync-filter"

# Both counts have to be plain decimal integers between 0 and 7. The digit
# check comes first because a failing integer comparison inside [ ] returns
# status 2, which would otherwise let non-numeric values slip through to the
# arithmetic loop header below. The negation stays outside the brackets so
# that values [ ] cannot parse are rejected as well.
if ! [[ "${skipNum}" =~ ^[0-9]+$ ]] || \
  ! [ "${skipNum}" -le 7 ] 2>/dev/null || \
  ! [[ "${maxNum}" =~ ^[0-9]+$ ]] || \
  ! [ "${maxNum}" -le 7 ] 2>/dev/null
then
  usage
fi

# Run the stages skipNum+1 .. maxNum in order; every stage is applied to all
# configured backups (the keys of the backups array) before the next stage
# begins. do_stage picks the current backup up from the global backupID.
stage=$((${skipNum}+1))
while ((stage<=${maxNum}))
do
  printf '%s\n' "entering stage ${stage} ($(do_stage ${stage} '##DESCRIBE##')) ..."
  for backupID in "${!backups[@]}"
  do
    printf '%s\n' "${backupID}:"
    do_stage ${stage} "${backupID}"
  done
  printf '%s\n' "... stage ${stage} completed."
  stage=$((stage+1))
done
