#!/usr/bin/perl
# ++ 0.009 2021-11-09 13:23:18 44e9d7087452bb6fa3ba4117f89228996787a912
use strict;
use warnings;
use 5.026;

# ==================================================================================
# Modification history
# 09/11/2021  Reformat cn's and names in the form "Blatz, Joe"
# 31/10/2021  Handle address lists 
# 28/10/2021  Handle email addresses in duplicates better
# 27/10/2021  Reject groups; allow deletion of output file
# 27/10/2021  Handle given names and surnames better; handle dups better.
# 26/10/2021  Check for duplicate dn's; supply ou and prefix
# 25/10/2021  Initial version
# ==================================================================================

=pod

=head1 NAME

abookrft - Reformats an exported Thunderbird address book into valid LDIF

=head1 SYNOPSIS

  abookrft --ou "OU spec" --prefix "prefix spec"
           [ --output /some/file ] [ -x ] [ --dups /some/other/file ]
           [-h|--help] [-V] [-z] address-book-file

=head1 DESCRIPTION

Makes changes to an exported TBird address book which were found to be necessary in testing.

=head1 OPTIONS

=over

=item --ou

Specify the Organisational Unit within the LDAP database in which the address book entries are kept.  Required.

=item --prefix

Specifies the LDAP prefix for the address book database, for example "cn=example,cn=org,cn=uk".  Required.

=item --output

Specifies an output file name.  If this option is not specified, the output file resulting from an
input address book named "abook.ldif" will be "abook-transformed.ldif".  If that file already exists the program
will stop, unless the B<-x> option is coded.

The output file may not be the same as the input file.

=item --dups

Specify a file into which the CSV-formatted duplicate information will be written.  Default is B<filename-dups.csv>.

=item -x

Delete the output file before starting.

=item -h | --help

Prints documentation and exits.

=item -V

Prints the current version of the program and exits.

=item -z

Provides additional diagnostic information.

=back

=cut

package LDIFStanza;

use 5.010;
use strict;
use warnings;
use Carp;
use Data::Dumper;
use Email::Address;
use POSIX qw(mktime);
use Text::Balanced qw( extract_delimited );

sub new{
  my ( $class, $text, $our_ou, $our_prefix, $debug ) = @_;
  croak "[LDIFStanza-new] no prefix\n" unless $our_prefix;
  my $self = {};
  for my $l ( split( "\n", $text ) ){
    next unless $l;
    next if substr($l,0,1) eq '#';
    if ( $l =~ /^(.*): (.*)/ ){
      my ( $key, $data ) = ( $1, $2 );
      #print "Key=$key, data='$data'\n" if $debug;
      push @{$self->{$key}}, $data;  # Always a list even if single valued
    }
  }
  $self->{'original_text'} = $text;

  # We use an explicit cn as such; if it's not there then we look for a quote-delimited
  # name in the email field and use that.  If that's not there then we'll use the value
  # of the "mail" attribute later.
  
  if ( exists $self->{dn} ){
    # In mailing lists, dn has only "cn=someone@example.com"; no mail=
    if ( $self->{dn}[0] =~ /cn=(.*)/ && $self->{dn}[0] !~ /mail=/ ){
      $self->{cn}[0] = _normalise( $1 );
      print "Found group name '$self->{cn}[0] \n" if $debug;
      $self->{dn}[0] = "cn=$self->{cn}[0],ou=$our_ou,$our_prefix";
    }
    elsif ( $self->{dn}[0] =~ /cn=(.*),mail=/ ){
      my $cn = $1;
      # Some cn's are like "Joe Blatz,blats@example.com"
      if ( my @addrs = Email::Address->parse( $cn ) ){
        my $email = $addrs[0]->original();
        print "Removing ',$email' from cn:'$cn' \n" if $debug;
        $cn =~ s/,$email//;
        # Use it for email unless there already is one
        $self->{mail}[0] = $email unless exists $self->{mail} && $self->{mail}[0];
      }
      $cn = _normalise( $cn );
      if ( $cn =~ /^(.*),(.*)$/ ){   # Name is of form "Blatz, Joe"
        print "Reversing name parts in '$cn'\n";
        $self->{givenName}[0] = _normalise($2);
        $self->{sn}[0] = _normalise($1);
        $cn = "$self->{givenName}[0] $self->{sn}[0]";
        print "Givenname:'$self->{givenName}[0]', sn:'$self->{sn}[0]', cn:'$cn' \n";
      }
      $self->{cn}[0] = $cn;
      $self->{dn}[0] = "cn=$self->{cn}[0],ou=$our_ou,$our_prefix";
      print "Edited dn: '$self->{dn}[0]'\n" if $debug;
    }
    elsif( $self->{dn}[0] =~ /dn: mail=(.*)/ ){
      print "dn has 'mail' but no 'cn=' attribute.\n" if $debug;
    }
  }
  else{
    print "No dn ------------\n$text\n      ------------\n" if $debug;
    return;
  }

  # -- modifytimestamp won't be loaded, but convert it to an epoch number
  if (  exists $self->{'modifytimestamp'} ){
    if ( $self->{'modifytimestamp'}[0] =~ /(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})(.*)/ ){
      my ( $year, $mth, $day, $hh, $mm, $ss, $tz ) = ( $1, $2, $3, $4, $5, $6, $7 );
      $self->{timestamp} = "$year-$mth-$day $hh:$mm:$ss";
      printf "Modify timestamp '%s' converted to '%s' \n",
        $self->{'modifytimestamp'}, $self->{timestamp} if $debug;
    }
    delete $self->{'modifytimestamp'};
  }
  else{
    $self->{timestamp} = 'Not known';
  }

  # -- mozillaUseHtmlMail needs to be TRUE or FALSE
  if ( exists $self->{'mozillaUseHtmlMail'} ){
    $self->{'mozillaUseHtmlMail'}[0] = $self->{'mozillaUseHtmlMail'}? 'TRUE' : 'FALSE';
    print "Changed 'mozillaUseHtmlMail' to $self->{'mozillaUseHtmlMail'}[0]\n" if $debug;
  }

  # Slight differences between a "person" stanza and a "groupOfNames" one
  if ( belongs_to_class( $self, 'groupOfNames' ) ){   
    # -- Reformat "member" entries
    unless ( $self->{member} ){
      printf "List '%s' has no members; ignoring it\n", $self->{cn}[0];
      return;
    }
    print "Reformatting 'member' lines \n" if $debug;
    for my $mno( 0 .. $#{$self->{member}} ){
      print "Reformatting member '$self->{member}[$mno]'\n" if $debug;
      $self->{member}[$mno] =~ s/ ,/,/g;  # Tbird puts a space before ",cn=" for some reason
      my $msgcn;
      if ( $self->{member}[$mno] =~ /^mail=(.*),?/ ){
        $msgcn = _cn_from_email( $1, $debug );
      }
      else {
        $msgcn = _cn_from_email( $self->{member}[$mno], $debug );
      }
      print "... cn='$msgcn' \n" if $debug;
      $self->{member}[$mno] = "cn=$msgcn,ou=$our_ou,$our_prefix";
    }
    # -- use cn as "description" also
    $self->{description}[0] = $self->{cn}[0];
  }
  
  else{  # Person
    # -- Ensure correct email formatting
    my @addresses = Email::Address->parse( @{$self->{mail}} );  # Email::Address objects
    unless ( @addresses ){
      print "No email found in entry. dn='$self->{dn}[0]'\n";
      return;
    }
    @{$self->{mail}} = map{ $_->format() } @addresses;
    printf "Email address(es): %s\n", join( ',', @{$self->{mail}} ) if $debug;
    
    # -- Now use first email as cn if we don't have one
    unless ( $self->{cn} ){
      $self->{cn}[0] = _cn_from_email( $self->{mail}[0], $debug );
      print "Missing cn; used '$self->{cn}[0]'\n" if $debug;
      $self->{dn}[0] = "cn=$self->{cn}[0],ou=$our_ou,$our_prefix";
    }

    # -- Work out surname and given name if not present
    if ( ! ($self->{sn} && $self->{givenName}) ){
      my @name_parts = split( /\s+/,$self->{cn}[0] );
      if ( @name_parts > 1 ){  # We can choose given and sn
        $self->{sn} = [ pop @name_parts ];
        $self->{givenName} = [ join(" ",@name_parts) ];
        print "Calculated givenName='$self->{givenName}[0]', sn='$self->{sn}[0]'\n" if $debug;
      }
    }
    
    # -- Must have a sn: entry
    unless ( exists $self->{'sn'} ){
      if ( exists $self->{'givenName'}[0] ){
        $self->{'sn'}[0] = $self->{'givenName'}[0];
      }
      elsif ( exists $self->{'cn'}[0] ){
        $self->{'sn'}[0] = $self->{'cn'}[0];
      }
      else{
        $self->{'sn'}[0] = $self->{mail}[0];
      }
      print "Added sn: $self->{'sn'}[0]\n" if $debug;
    }
  }
  
  # -- Remove quotes, leading/trailing spaces and trailing commas from sn, givenName, cn, dn
  for ( qw(sn givenName cn dn) ) {
    next unless $self->{$_}[0];
    $self->{$_}[0] =~ s/["']//g;
    $self->{$_}[0] =~ s/^\s*|\s*$//g;
    $self->{$_}[0] =~ s/,$//;
  }
  
  
  printf "Entry created dn='%s', mail=%s \n", $self->{dn}[0], $self->{mail}[0]//'none'
    if $debug;
  
  bless $self, $class;
  return $self;
}

sub attribute_count{
  return scalar keys %{$_[0]};
}

sub cn{
  my $self = shift;
  $self->{cn}[0] = $_[0] if @_;
  return $self->{cn}[0];  
}

sub dn{
  my $self = shift;
  $self->{dn}[0] = $_[0] if @_;
  return $self->{dn}[0];
}

sub email{
  # Returns or sets the list of email addresses
  #   @new_list = $entry->email( @list_of_addresses )
  my $self = shift;
  if ( @_){
    my @new_email = @_;
    $self->{mail} = \@new_email;  
  }
  return @{$self->{mail}}
}

sub belongs_to_class{
  # Returns TRUE if the stanza contains "objectclass: xxxx", where 'xxxx' is the
  # supplied parameter
  my ( $self, $o_class ) = @_;
  return 1 if $o_class && grep /^${o_class}$/, @{$self->{objectclass}};
}

sub members{
  my $self = shift;
  return unless belongs_to_class( $self, 'groupOfNames' );
  return @{$self->{member}};
}

sub SecondEmail{
  # Optionally sets and then return the 'mozillaSecondEmail' attribute
  my $self = shift;
  $self->{mozillaSecondEmail}[0] = $_[0] if @_;
  return $self->{mozillaSecondEmail}[0];
}
sub show_dups{
  # Compares two entry objects.
  #  $text = $entry->show_dups( $other_entry )
  my ( $self, $other ) = @_;
  croak "[LDIFStanza-show_dups] invalid call" unless $other && ref($other) eq 'LDIFStanza';
  my %fields;
  $fields{$_}=1 for keys %$self;
  $fields{$_}=1 for keys %$other;
  my $txt = sprintf "$self->{dn}[0]\t\tPrevious (%s)\tCurrent (%s)\n", $self->{timestamp}, $other->{timestamp};
  for my $f ( sort keys %fields ){
    next if $f eq 'objectclass' || $f eq 'original_text' || $f eq 'start_line' || $f eq 'timestamp'
	|| $f eq 'dn';
    my $line = "\t$f\t";
    if (  exists $self->{$f} && defined $self->{$f} ){
      if ( exists $other->{$f} && defined $other->{$f} ){
	next if $self->{$f}[0] eq $other->{$f}[0];
	$line .= "$self->{$f}[0]\t$other->{$f}[0]";
      }
      else{
	$line .= "$self->{$f}[0]\t-"
      }
    }
    elsif( exists $other->{$f} && defined $other->{$f} ){
      $line .= "-\t$other->{$f}[0]"
    }
    $txt .= "$line\n";
  }
  return $txt;
}

sub start_line{
  my ( $self, $start_line ) = @_;
  $self->{start_line} = $start_line if defined $start_line;
  return $self->{start_line};
}

sub text{
  my $self = shift;
  my $txt = "dn: $self->{dn}[0]\n";
  for my $k ( sort keys %$self ){
    next unless ref($self->{$k}) eq 'ARRAY';  # All the 'internal' elements are scalars
    next if $k eq 'dn';                       # Done already
    $txt .= "$k: $_\n" for @{$self->{$k}};
  }
  return $txt;
}

sub _cn_from_email{
  # Helper function to extract a cn from an email address
  my $text = shift or return;
  my $debug = shift || 0;
  my $cn;
  if ( $text =~ /cn=([^,]*),?/ ){
    return $1;
  }
  if ( my @list = Email::Address->parse( $text ) ){
    # Use email address, but needs editing
    my $email = $list[0]->format();
    $email =~ s/^["']|["']$//g;  # Remove leading/trailing quotes
    $email =~ s/\\//g;           # Un-escape any other quotes
    if (my @extracted = extract_delimited( $email, q("') )){
      if ( @extracted ){
        if ( $extracted[0] ){  # Found a name
          $cn = $extracted[0];
          $cn =~ s/^["']|["']$//g;  # It still had its delimiters
          print "[_cn_from_email] found name '$cn' \n" if $debug;
        }
        elsif ( $extracted[1] ){            # Remainder will have to do
          $extracted[1] =~ s/^[\s"']*|[\s"']*$//g;
          $cn = $extracted[1];
          print "[_cn_from_email] using residual text '$cn' \n" if $debug;
        }
      }
    } 
  }
  else{
    print "[_cn_from_email] Cannot find cn, returning '$cn' \n" if $debug;
    $cn = $text;
  }
  $cn =~ s/^\s*|\s*$//g;   # Remove any lingering leading/trailing space
  return $cn;
}

sub _normalise{
  # Remove leading and trailing spaces, multiple spaces and escapes
  my $text = shift or return;
  my $debug = shift;
  print "Normalising '$text' \n" if $debug;
#  $text =~ s/["']//g;
  $text =~ s/\\//g;
  $text =~ s/^[\s"']*|[\s"']*$//g;
  $text =~ s/  / /g;
  print "... '$text' \n" if $debug;
  return $text;
}

package main;

use File::Basename;
use Getopt::Long;
use List::Util qw(uniq);
use Pod::Usage;

our $VERSION = "0.009";

# ---------------------------------------------------------------------
# Process flags and parameters
# ---------------------------------------------------------------------
my ( $ou, $prefix, $out_file, $dups_file, $delete_output);
my ( $opt_h, $opt_V, $debug, $flg_help ) = ( 0, 0, 0, 0 );
GetOptions ( 'ou=s'       => \$ou,
	     'prefix=s'   => \$prefix,
	     'output:s'   => \$out_file,
	     'dups=s'     => \$dups_file,
	     'x'          => \$delete_output,
             'h|help'     => \$flg_help,
	     'h'          => \$opt_h,
	     'V'          => \$opt_V,
	     'z'          => \$debug )
  or pod2usage( -verbose=>0 ); 

print "$0 version $VERSION\n" and exit if $opt_V;
pod2usage( -verbose => 2 ) if $flg_help;

pod2usage( -message => "Must specify both 'ou' and 'prefix'" )
    unless $ou && $prefix;

my $input_file = shift or pod2usage( -message => "Must specify an input file" );
die "File '$input_file' not found\n" unless -e $input_file;
my ( $name, $path, $ext ) = fileparse( $input_file, qr/\.[^.]*/);
$out_file = "$path$name-transformed$ext" unless $out_file;
$dups_file = "$path$name-dups.csv" unless $dups_file;
print "Output file is '$out_file'\n" if $debug;
die "Output file '$out_file' exists; not safe to continue\n" if -e $out_file && !$delete_output;

my %person;  # Holds person objects
my %lists;   # Holds groupOfNames objects

open my $INPUT, '<', $input_file  or die "'$input_file': $!\n";

# Remove any existing dups file with this name, and arrange not to open it unnecessarily
unlink $dups_file if -e $dups_file;
my $DUPS;
my $dups_file_is_open = 0;   # Don't open it unless we need to write to it

my $entry_text;
my $version_line = <$INPUT>;
chomp $version_line;
$version_line =~ s/\s*$//;
die "Invalid version line '$version_line'\n" unless $version_line eq 'version: 1';

my $line_count = 1;
my $entry_count = 0;
my $dup_count = 0;
my $start_line = 0;
my $previous_line = '';

# -- parse the input LDIF
while( my $line = <$INPUT> ){
  chomp $line;
  $line_count++;
  printf "[%4d] %s\n", $line_count, $line if $debug;
  if ( $line ) {
    unless ( $previous_line ){
      print "Setting start line to $line_count \n" if $debug;
      $start_line = $line_count
    }
    $entry_text .= "$line\n"; 
  }
  else {
    generate_entry( $entry_text, $start_line );
    $entry_text = '';
  }
  $previous_line = $line;
}
if ( $entry_text ){  # Do the last one
  generate_entry( $entry_text, $start_line );
}
close $INPUT;

# -- Check that all addresses in lists are elsewhere in the ABook
foreach my $group ( keys %lists ){
  print "Checking members of group '$group' \n" if $debug;
  foreach my $mem( $lists{$group}->members() ){
    print "... '$mem'\n" if $debug;
    if ( $mem =~ /\@/ ){
      print "'$mem' looks like bare email address; not checking. \n" if $debug;
    }
    else{
      print "Warning: member '$mem' in list '$group' not found in this address book\n"
        unless exists $person{ $mem };      
    }
  }
}

open my $OUT, '>', $out_file or die "'$out_file': $!\n";
print $OUT "version: 1\n\n";
printf $OUT "# LDIF created from %s by %s version %.3f\n\n", $input_file, $0, $VERSION;
for my $ent ( sort keys %lists ){
  print $OUT $lists{$ent}->text(),"\n";
}
for my $ent ( sort keys %person ){
  print $OUT $person{$ent}->text(),"\n";
}

close $OUT;
close $DUPS if $dups_file_is_open;

printf "%s written. Lines in %d, entries %d\n", $out_file, $line_count,
  (scalar keys %person) + scalar keys %lists;

sub generate_entry{
  # Takes parsed text and reformats into an ldif stanza. Handles duplicates,
  # modifying the original ldif in place, adding in new email addresses. Adds
  # to %ld hash.
  my ( $entry_text, $start_line ) = @_;
  die "[generate_entry] no start line" unless defined $start_line;
  return unless $entry_text;  # First iteration has none
  print "Creating LDIFStanza from\n$entry_text-----------------------[starting $start_line]\n" if $debug;
  my $entry = LDIFStanza->new( $entry_text, $ou, $prefix, $debug );
  if ( $entry ){
    my $this_dn = $entry->dn();
    $entry->start_line( $start_line );
    
    if ( $entry->belongs_to_class( 'groupOfNames' ) ){
      if( exists $lists{ $this_dn } ){
        printf "Duplicate mailing list '%s' at line %d - ignored\n", $this_dn, $start_line;
        return;        
      }
      $lists{ $entry->dn() } = $entry;
      $entry_count++;
    }
    
    else{

      # Handle duplicate person records
      # If multiple email addresses are found, put them all in as 'mail:' attributes in
      # the original entry; TBird won't recognise anything other than the first, though
      # Put the second email in priority list in the 'mozillaSecondEmail' attribute of
      # the original dn.
      
      if ( exists $person{$this_dn} ){
        if ( $entry->belongs_to_class( 'groupOfNames') ){
          printf "Duplicate mailing list '%s' at line %d; ignoring\n", $this_dn, $start_line;
          return;
        }
        else{
          printf "Duplicate dn '%s' at line %d\n", $this_dn, $start_line;
        	if ( my @diffs = compare_entries( $entry, $person{$this_dn} ) ){
            # If the mail attributes are different we need to handle those specially
            if ( grep /^mail$/, @diffs ){
              
              # Gather the various email addresses in order
              my @mail_addrs;  # This will be the "mail" attributes for the first (primary) dn
              push @mail_addrs, $entry->{mail}[0];              # 1st place is the new 'mail' entry
              push @mail_addrs, $person{$this_dn}->{mail}[0];       # 2nd is the first existing 'mail' entry
              push @mail_addrs, $person{$this_dn}->SecondEmail()
                if $person{$this_dn}->SecondEmail();                # 3rd is any existing 'mozillaSecondEmail'
              push @mail_addrs, @{$entry->{mail}}[1..$#{$entry->{mail}}];          # Then the remaining new 'mail' values
              push @mail_addrs, @{$person{$this_dn}->{mail}}[1..$#{$person{$this_dn}->{mail}}]; # Last the old mail values
              @mail_addrs = uniq @mail_addrs;                   # Weed out duplicates (preserves order)
              
              # Update the "mail" and "SecondEmail" attributes of the original dn
              print "Updating email addresses to ", join(',',@mail_addrs), "\n" if $debug;
              $person{$this_dn}->email( @mail_addrs );
              if ( $mail_addrs[1] ){
                print "Adding $mail_addrs[1] as mozillaSecondEmail\n" if $debug;
                $person{$this_dn}->SecondEmail( $mail_addrs[1] );            
              }
            }
        	}
        	else{
        	  print "   Identical to the first; ignoring it.\n";
        	  return;
        	}
        
        	# Work out next duplicate dn and add stanza to %ld
        	my $dup_no = 1;
        	my $dup_dn = sprintf "cn=%s[%d],ou=%s,%s",$person{$this_dn}->cn(), $dup_no, $ou, $prefix;
        	while ( exists $person{$dup_dn} ){
        	  $dup_dn = sprintf "cn=%s[%d],ou=%s,%s",$person{$this_dn}->cn(), ++$dup_no, $ou, $prefix;
        	}
        	print "Setting dn of duplicate entry to '$dup_dn'\n" if $debug;
        	$entry->dn( $dup_dn );
        	$this_dn = $dup_dn;
        }
      }
      $person{$this_dn} = $entry;
    }
  }
  else{
    print "Invalid or empty ldif text starting on line $start_line\n";
    print "--------------------------------------\n$entry_text--------------------------------------\n";
  }
}

sub compare_entries{
  # Comparse two LDIFStanza objects and returns a list of the LDAP fields that differ
  # between them. 
  my ( $one, $two ) = @_;
  die "[compare_entries] invalid call" unless $two && ref($one) eq 'LDIFStanza'
    && ref($two) eq 'LDIFStanza';
  my @diffs;
  for my $key ( keys %$one ){
    next unless ref $one->{$key} eq 'ARRAY';  # all ldap fields are arrays
    next if $key eq 'modifytimestamp';
    for my $val ( uniq (@{$one->{$key}}, @{$two->{$key}}) ){  # All used values
      unless ( grep /$val/, @{$one->{$key}} && grep /$val/, @{$two->{$key}} ){
        push @diffs, $key;
        last;
      }
    }
  }
  return @diffs;
}