#!/usr/bin/perl -w
#
# copy-by-attr
#
# copy a list of files into a set of directories named according
# to attributes of the files which are being copied.
#
# some use cases:
#  + copying photos from digital camera memory cards
#    into a directory hierarchy based on the dates the images
#    were taken, all in one go
#
#  + creating a set of directories containing links to files
#    based on their size, owner and suffix
#  
#
# usage:
#   [options] sources... target-root
#
#  where:
#    sources...		 a list of files to be copied (or linked to),
#
#    target-root	 the top-level directory of the place where files
#                        will be copied to
#  
# and options can be any of the *attribute* options:
#
#   --date		-- use modification date to group files
#   			   -- format is yyyy-mm-dd
#		           (this is the default if no attribute options are specified)
#
#   --year		-- use modification year to group files
#   			   -- format is yyyy
#
#   --hour		-- use modification hour to group files
#   			   -- format is hh
#
#   --suffix		-- use file suffix to group files
#			   -- files with no suffix get grouped under 'NO-SUFFIX'
#
#   --size		-- use size to group files
#   			   -- directories are named according to
#			         empty,1k,10k,100k,1M,10M,100M,1G,huge
#			      and each file is stashed into the smallest group
#			      that would accommodate it completely
#
#   --user		-- use user name to group files
#
#   --group		-- use group name to group files
#
#
# and any of the *behaviour* options:
#
#   --symlink		-- only symlink to source items, rather than copying them
#
#   --attrname		-- include in the generated path names a component that
#			   names the class of attribute; so, copying by date to
#			   /foo would create subdirs such as /foo/by-date/yyyy-mm-dd/;
#			   this can be useful where it is possible for attribute
#			   values to be duplicated between sets (such as using
#			   --hour and --suffix, and one of the suffixes happened
#			   to be '10', for example)
#
#   --tgtfirst		-- provide the target directory *before* the source arguments
#			   (handy for when you want the program to be invoked from
#			   something like xargs or the --exec option of find)
#
#
################################################################
#
# the target-root is treated as the root of a tree of sub-directories,
# each named according to the rules for an attribute, and the
# individual copied files are put into the corresponding subdirectory.
#
# so, if you copy a bunch of files from a camera, which are named
# R1234567.JPG, (where 1234567 is some arbitrary number, different
# for each image), and you specify that they be copied by date to
# /foo, then each image will end up in a directory such as
# /foo/2008-01-02/R1234567.JPG, where '2008-01-02' is the modification
# date of file R1234567.JPG. 
# 
################################################################
#
# 
# The default is to look at the end of the arg list for the target
# directory, but this can cause problems if you want to invoke
# this from xargs, for example. So,  you may pass the target-root
# on the command line *before* the source files by using the --tgtfirst
# option -- this is contrary to convention, but allows using xargs for
# marshalling lots of files, perhaps as the result of a find.
#
#
#
# Note that if no *attribute* options are passed, then 'date' is presumed.
#
# If you do specify any attribute options, and also wish 'date' to be
# included, you must specifiy it explicitly.
#
#
# Note that, if you specify more than one *attribute* option, then
# at most one copy of each file will be made; all other attribute options
# will be satisfied by using symlinks.  You can, of course, make multiple
# copies by just invoking this command multiple times, once for each
# attribute option.  (And, no, at present you cannot specify which
# attribute option is the copy, and which are the symlinks.)
#
#
# Behaviour options are independent of the attribute options, and their
# presence or absence has no impact on how the attribute options are handled.
#
################################################################
#
# Obvious enhancements:
#
#  + make it process a list of files piped in, rather than explicitly handed
#    to it on the command line.  This is only a minor enhancement, since I've
#    deliberately allowed for 'target-root' to be at the start of the arglist
#    to allow things like xargs to invoke this cleanly.
#
#  + make the attribute(s) of each file that affect how they're copied/linked
#    based on an external process (or an expression, but that could be nightmarish
#    compared with having an external task with a simple calling convention);
#    I'm specifically not succumbing to the temptation to do this just now, because
#    there is no specific problem that I have that needs it, and I can see it being
#    the start of several complete re-writes, culminating in some dynamically-
#    generated, database-backed file system layer that wins smart-arsery awards
#    while solving no existing problems any better than this script does.
#
#
################################################################
#
# Scratched into existence by Frank Wales (frank@limov.com) due to a
# colossally itchy pile of memory cards whose contents needed filing by date.
#
################################################################
#
# What, you want a version number?  Aw, for fuff's sake, it's
# just a little script.
#
# All right then, this is version Pink-and-fluffy-Slash-Apple-Point-Five.
#
# Happy now?
#
################################################################
#
# Changes?  I've had a few. But then again, too few to mention.
#
# (Except for that one where I put the target root back at the
#  end of the arglist, and added the --tgtfirst option.
#  Oh, and switched from gmtime to localtime to make the
#  date-based options more intuitive for pics taken near midnight.)
#
################################################################
#
# You might as well have some permission to copy and change it too,
# I guess, so this program is:
#
#   Copyright Limitless Innovations 2008
#
# and is licensed under your choice of:
#
#   + the GNU General Public License version 2 or later, a copy of which is not attached
#   + the deal we did in the off-licence that you signed with your own blood
#   + some other arrangement, for which you are welcome to contact
#         Frank Wales, being frank@limov.com
#     along with your proposals for exchange of consideration,
#     meeting of minds, et cetera, ad infinitum, dona nobile, allons-y.
#
#
################################################################
#
# And now, our Feature Presentation
#

use strict;
use warnings;

use File::Basename;
use Getopt::Long;
use Cwd;

my $do_symlink ='';
my $do_attrname='';
my $do_tgtfirst='';

my $HERE=cwd;

################################################################
#
#  some path-naming utilities -- all expect to be passed in
#  the path name of the source, as passed in on the command line,
#  and the output of stat on that same file as a 13-element array
#
#  (we could stat the thing, given its name, but since the
#   caller has to stat it anyway, why hit the file system
#   twice for every file?)
#
################################################################

################################################################
#
#  modification date  -- currently uses hard-coded format;
#  suggestions welcome for ways to customize this that aren't
#  hideously complicated -- I guess the POSIXly-correct way
#  would involve $LC_/$LANG-type stuff, but I generally hate
#  those for doing almost the wrong thing in nearly the right way.
#
#  I suppose I'm willing to be persuaded, though...
#

sub mdate_path_for($@) {
    my $filename=shift;
    my @ltm=localtime($_[9]);

    return sprintf('%04d-%02d-%02d',$ltm[5]+1900,$ltm[4]+1,$ltm[3]);
}

sub myear_path_for($@) {
    my $filename=shift;
    my @ltm=localtime($_[9]);

    return $ltm[5]+1900;	#  you don't really want a sprintf here, do you?
}

sub mhour_path_for($@) {
    my $filename=shift;
    my @ltm=localtime($_[9]);

    return sprintf("%02d",$ltm[2]);
}

#
# I think that's enough time-based examples for you to be able
# to cons up your own extra ones if you need them
#
################################################################


################################################################
#
#  this seems to me to be quite ugly -- suggestions welcome for
#  making grouping by size more useful; in particular, making
#  the step sizes more intuitive would be good.
#

sub size_path_for($@) {
    my $filename=shift;
    my $filesize=$_[7];

    # yes, it could be a loop and a hash, but that's really
    # just putting a fancy hat on what's still a horse's butt
    if    ($filesize==0        ) { return 'empty'; }
    elsif ($filesize<1024      ) { return '1k'   ; }
    elsif ($filesize<10240     ) { return '10k'  ; }
    elsif ($filesize<102400    ) { return '100k' ; }
    elsif ($filesize<1048576   ) { return '1M'   ; }
    elsif ($filesize<10485760  ) { return '10M'  ; }
    elsif ($filesize<104857600 ) { return '100M' ; }
    elsif ($filesize<1073741824) { return '1G'   ; }
    else                         { return 'huge' ; }
}

################################################################
#
# falls back to BIG-FUGLY default that is highly unlikely
# ever to be used as an actual suffix
#

my $no_suffix='NO-SUFFIX';

sub suffix_path_for($@) {
    my $filename=shift;		#  this is the only one that currently needs the filename

    if ($filename=~m(\.([^./]*)$)) {  # arbitrarily specify suffix as all that
				     # follows the last '.' as long as it doesn't include a '/'
	return $1;
    }
    else {
	return $no_suffix;
    }
}

################################################################
#
#  uses trivial fall-back when UID is not known locally 
# 

sub user_path_for($@) {
    my $filename=shift;
    my $uid=$_[4];

    return getpwuid($uid) || "uid=$uid";
}

################################################################
#
#  uses trivial fall-back when GID is not known locally 
# 

sub group_path_for($@) {
    my $filename=shift;
    my $gid=$_[5];

    return getgrgid($gid) || "gid=$gid";
}

# the keys for this hash define the names of the
# attribute options that we can be invoked with
my %path_for=(
   date		=>  \&mdate_path_for,
   year		=>  \&myear_path_for,
   hour		=>  \&mhour_path_for,
   suffix	=> \&suffix_path_for,
   size		=>   \&size_path_for,
   user		=>   \&user_path_for,
   group	=>  \&group_path_for
);

my $default_attr='date';

my %invoked_with=();

# start with the behaviour options 
my %options=(
   symlink	=> \$do_symlink,
   attrname	=> \$do_attrname,
   tgtfirst	=> \$do_tgtfirst
);

# then stuff in the attribute options, based on the
# functions we've associated with %path_fors

foreach my $optname (keys %path_for) {
    $invoked_with{$optname}='';
    $options{$optname}=\$invoked_with{$optname};
}

GetOptions(%options);

# after that, only non-command-line options ought to remain in @ARGV

if ($#ARGV<1) {
   print STDERR $0,": too few arguments\n  usage: $0 [options] source[...] target-root\n     or: $0 [options]  --tgtfirst target-root source[...]\n  where options can be any of:\n";

   foreach my $opt (sort keys %options) {
       print STDERR "      --$opt\n";
   }
   
   exit 2;
}

unless (grep /1/, @invoked_with{keys %path_for}) {
    $invoked_with{$default_attr}=1;
}

my $target;

if ($do_tgtfirst) {
    # take the *first* available non-option argument, which is contrary to
    # standard copying conventions, but enables xargs-type shenanigans
    $target=shift(@ARGV);
}
else {
    # take the last available non-option argument
    $target=pop(@ARGV);
}

# if $target ends in a / (which might have been put there by the
# shell if markdirs is turned on), then strip it, if only to
# keep tidy any symlinks that we generate
chop $target if substr($target,-1) eq '/';

my @sources=@ARGV;

# now, prep the target area

unless (-d $target && -w $target) {
   print STDERR $0,": cannot write to target directory '",$target,"'\n";
   exit 1;
}

my %attr_paths=();

foreach my $attr (keys %path_for) {
    $attr_paths{$attr}=$do_attrname?"by-$attr/":"";
}


# we allow multiple attribute copies, why not?
#
# however, in this case, if we've been asked to copy,
# we only copy to the first attribute, and symlink the
# rest to it
#
# if we've been explicitly asked to symlink, we symlink
# everything and copy nothing
#
# note that, at present, we don't have a way of specifying
# which attribute should be the copied one, to which all
# the other will symlink
#
# (also, the symlinking is dumb, in that it uses the
# full path name, rather than the shortest relative pathname,
# which would make the target tree re-parentable -- feel free
# to fix this egregious oversight if it matters to you)
#


SOURCE:
foreach my $src (@sources) {
    my @srcstat=stat $src;
    my $src_basename=basename($src);

    unless ($#srcstat==12) {
      print STDERR $0,": cannot stat '$src' -- $!\n";
      next SOURCE;
    }

    my @targets=();

    for my $optname (keys %path_for) {
	if ($invoked_with{$optname}) {
	    push (@targets,$attr_paths{$optname}.$path_for{$optname}($src,@srcstat));

	    mkdir "$target/$attr_paths{$optname}",0775 unless -d "$target/$attr_paths{$optname}";
	}
    }

    my $copy='';
    
    $copy=shift(@targets) unless $do_symlink;

    my $copypath='';
    my $copytgt='';

    # do the copy
    if ($copy) {
        $copypath="$target/$copy";

	mkdir $copypath,0775 unless -d $copypath;

	unless (-d $copypath) {
	    print STDERR $0,": cannot create directory '$copypath' -- $!\n";
	    next SOURCE;
	}

	# we don't attempt to be recursive -- we depend on the invoker
	# to do that for us, perhaps with find or something, instead
	#
	# we are lenient about copying from symlinks, though
	unless (-f $src || -l $src) {
	    print STDERR $0,": skipping non-copyable item '$src'\n";
	    next SOURCE;
	}

        unless (open(IN,"<",$src)) {
	    print STDERR $0,": cannot open '$src' for reading -- $!\n";
	    next SOURCE;
	}

	$copytgt="$copypath/$src_basename";

	unless (open(OUT,">",$copytgt)) {
	    print STDERR $0,": cannot open '$copytgt' for writing -- $!\n";
	    close IN;
	    next SOURCE;
	}

	# maybe we should check for write failures, eh?
	while(<IN>) {
	    unless (print OUT) {
		print STDERR $0,": error writing to '$copytgt' -- $!\n";

		close OUT;
		close IN;

		exit 1;		# write failure is serious enough to scrub the mission
	    }
	}

	close OUT;
	close IN;

	# haven't decided if it's worth whining about failures of these,
	# since many reasonable copies will actually fail for some or all of
	# these, because of system policies (can't chown), or file system
	# limitations (VFAT, for example)
	chmod $srcstat[2],	      $copytgt;
	utime $srcstat[8],$srcstat[9],$copytgt;
	chown $srcstat[4],$srcstat[5],$copytgt;
    }

    foreach my $link (@targets) {
	my $linkpath="$target/$link";

	mkdir $linkpath,0775 unless -d $linkpath;

	unless (-d $linkpath) {
	    print STDERR $0,": cannot create directory '$linkpath' -- $!\n";
	    next SOURCE;
	}

	if ($copytgt) {
	    symlink $copytgt,"$linkpath/$src";	    
	}
	else {
	    # need the full path to $src, really
	     if ($src=~/^\//) {
	       	symlink $src,"$linkpath/$src";
	     }
	     else {
	        symlink "$HERE/$src","$linkpath/$src";
	     }		
	}
    }
}

exit 0;
