#!/usr/bin/perl

=begin metadata

Name: cut
Description: select portions of each line of a file
Author: Rich Lafferty, rich@alcor.concordia.ca
License: perl

=end metadata

=cut

use strict;

use File::Basename qw(basename);
use Getopt::Std qw(getopts);

use constant EX_SUCCESS => 0;
use constant EX_FAILURE => 1;

my $me = basename($0);

my %opt;
getopts('b:c:d:f:ns', \%opt) or usage();

# There's no difference between -b and -c on any unix I
# use regularly -- it's for i18n. Thus, -n is a noop, too.
if (defined $opt{'b'}) {
    usage() if $opt{'f'};
    handle_b($opt{'b'});
}
elsif (defined $opt{'c'}) {
    usage() if $opt{'f'};
    handle_b($opt{'c'});
}
elsif (defined $opt{'f'}) {
    handle_f($opt{'f'}, $opt{'d'}, $opt{'s'});
}
else {
    warn "$me: byte, character or field list required\n";
    usage();
}
exit EX_SUCCESS;

sub checknum {
    my $n = shift;
    if ($n !~ m/\A\-?[0-9]+\Z/) {
        warn "$me: unexpected byte or field number: '$n'\n";
        exit EX_FAILURE;
    }
    if ($n == 0) {
        warn "$me: bytes and fields are numbered from 1\n";
        exit EX_FAILURE;
    }
}

sub parse_fields {
    my $spec = shift;
    my @list = split /,/, $spec;

    my $to_end;
    my @cols;
    foreach my $item (@list) {
        my ($from, $is_range, $to);
        if ($item =~ m/\A([0-9]*)(\-?)([0-9]*)\z/) {
            $from = $1;
            $is_range = $2;
            $to = $3;
        }
        else {
                warn "$me: invalid byte list: '$item'\n";
                exit EX_FAILURE;
        }
        checknum($from) if length($from);
        checknum($to)   if length($to);
        if (!length($from) && !length($to)) { # reject lone '-'
            warn "$me: invalid byte list\n";
            exit EX_FAILURE;
        }
        if (length($from) && length($to)) {
            if ($from > $to) {
                warn "$me: invalid range $from-$to\n";
                exit EX_FAILURE;
            }
            $is_range = 0 if $from == $to;
        }
        if ($is_range) {
            if (length($from) && length($to)) { # a-b
                push @cols, $from .. $to;
            }
            elsif (length $from) { # a-
                push @cols, $from;
                $to_end = 1;
            }
            else { # -a
                push @cols, 1 .. $to;
            }
        }
        else {
            push @cols, $from;
        }
    }
    my @sorted = sort { $a <=> $b } @cols;
    unshift @sorted, $to_end;
    return @sorted;
}

sub handle_b {
    my $spec = shift;
    my ($to_end, @cols) = parse_fields($spec);

    while (<>) {
        chomp;
        my $col = 0;
        my @chars = split //;
        foreach my $c (@cols) {
	    next if $c <= $col;
            print $chars[$c - 1];
            $col = $c;
        }
        if ($to_end) {
            $col++;
            foreach my $c ($col .. scalar(@chars)) {
                print $chars[$c - 1];
            }
        }
        print "\n";
    }
}

sub handle_f {
    my ($spec, $delim, $sflag) = @_;

    my ($to_end, @cols) = parse_fields($spec);
    if (defined $delim) {
        $delim = substr $delim, 0, 1;
    }
    else {
        $delim = "\t";
    }
    while (<>) {
        chomp;

        # Only waste time on lines with delimiters
        if (/$delim/) {
            my @hunk = split /$delim/;
            my $col = 0;
            my @out;
            foreach my $c (@cols) {
	        next if $c <= $col;
                push @out, $hunk[$c - 1];
                $col = $c;
            }
            if ($to_end) {
                $col++;
                foreach my $c ($col .. scalar(@hunk)) {
                    push @out, $hunk[$c - 1];
                }
            }
            print join($delim, @out), "\n";
        }
        else {  # no delimiter in line
            print "$_\n" unless $sflag;
        }
    }
}

sub usage {
    print <<EOT;
usage:  $me -b list [-n] [file ...]
        $me -c list [file ...]
        $me -f list [-d delim] [-s] [file ...]

Each LIST is made up of one range, or many ranges separated by commas.
Each range is one of:

  N     Nth byte, character or field, counted from 1
  N-    from Nth byte, character or field, to end of line
  N-M   from Nth to Mth (included) byte, character or field
  -M    from first to Mth (included) byte, character or field

EOT
    exit EX_FAILURE;
}

__END__

=pod

=head1 NAME

cut - select portions of each line of a file

=head1 SYNOPSIS

cut C<-b> list [C<-n>] [file ...]

cut C<-c> list [file ...]

cut C<-f> list [C<-d> delim] [C<-s>] [file ...]

=head1 DESCRIPTION

The B<cut> utility selects portions of each line (as specified by I<list>)
from each I<file> (or the standard input by default), and writes them to
the standard output.  The items specified by I<list> can be in terms of
column position or in terms of fields delimited by a special
character. Column numbering starts from 1.

I<list> is a comma- or whitespace-separated set of increasing numbers
and/or number ranges.  Number ranges consist of a number, a dash
('-'), and a second number and select the fields or columns from the
first number to the second, inclusive.  Numbers or number ranges may
be preceded by a dash, which selects all fields or columns from 1 to
the first number.  Numbers or number ranges may be followed by a dash,
which selects all fields or columns from the last number to the end of
the line.  Numbers and number ranges may be repeated, overlapping, and
in any order.  It is not an error to select fields or columns not
present in the input line.

=head1 OPTIONS

B<cut> accepts the following options:

=over 4

=item -b list

The I<list> specifies byte positions.

=item -c list

The I<list> specifies character positions.

=item -d string

Use the first character of I<string> as the field delimiter character
instead of the tab character.

=item -f list

The I<list> specifies fields, delimited in the input by a single tab
character.  Output fields are separated by a single tab character.

=item -n

Do not split multi-byte characters.

=item -s

Suppresses lines with no field delimiter characters.  Unless
specified, lines with no delimiters are passed through unmodified.

=back

=head1 BUGS

B<cut> does not understand multibyte characters; the C<-c> and C<-b>
options function identically, and C<-n> does nothing.

=head1 STANDARDS

This B<cut> implementation is compatible with the I<OpenBSD>
implementation.

=head1 AUTHOR

The Perl implementation of B<cut> was written by Rich Lafferty,
I<rich@alcor.concordia.ca>.

=head1 COPYRIGHT and LICENSE

This program is free and open software. You may use, copy, modify,
distribute and sell this program (and any modified variants) in any
way you wish, provided you do not restrict others to do the same.

=cut

