#!/usr/bin/perl

use warnings;
use strict;

# Build 'aligned docs' for Project Syndicate (WMT10 news commentary) data, which
# was provided as large line/sentence-aligned files.
#
# Run with no arguments for usage information
#
# Paul McNamee - 1/5/11
# Updates by James Mayfield 16 Sep 2011

# Hard-coded project root path.
# NOTE(review): nothing in this file reads $prefix -- presumably a leftover;
# confirm before removing.
my $prefix = "/export/projects/mlentlnk/";

# Foreign-language codes. Each one is paired with English in an input file
# pair named news-commentary10.<lang>-en.en / news-commentary10.<lang>-en.<lang>.
my @langs = qw(de cz es fr);
# Loaded corpus lines: key "<lang>-en" holds the English side and key
# "<lang>-<lang>" the foreign side, each as an array of chomped lines.
my %tbl;
# English title line => its 0-based line index in the de-en English file,
# or -1 when that title line was seen more than once.
my %title;

# Load both sides of one <lang>-en file pair into the file-level %tbl.
#
# Parameters:
#   $sourcedir - directory holding the news-commentary10.* files
#   $lang      - foreign language code (e.g. "de")
#
# Effects:
#   Appends the chomped lines of news-commentary10.$lang-en.en to
#   @{$tbl{"$lang-en"}} and those of news-commentary10.$lang-en.$lang to
#   @{$tbl{"$lang-$lang"}}. The English file is read first.
#
# Dies if either file cannot be opened. Prints a warning to STDERR when the
# two sides end up with a different number of lines, because the rest of the
# script pairs them up purely by line index.
sub loadLangFiles {
  my ($sourcedir, $lang) = @_;
  my $enfile = "$sourcedir/news-commentary10.$lang-en.en";
  my $xxfile = "$sourcedir/news-commentary10.$lang-en.$lang";

  # [ key in %tbl, file to read ] -- English first, then the foreign side.
  my @inputs = (
    ["$lang-en",    $enfile],
    ["$lang-$lang", $xxfile],
  );

  foreach my $input (@inputs) {
    my ($key, $path) = @$input;
    open(my $fh, "<:utf8", $path) or die "Could not open $path: $!";
    # Read into a lexical: "while (<$fh>)" would assign to the global $_
    # without localizing it and clobber whatever the caller has aliased there.
    while (my $text = <$fh>) {
      chomp $text;
      push(@{$tbl{$key}}, $text);
    }
    close $fh;
  }

  # "|| []" keeps an empty file (key never created) from tripping strict refs.
  my $encount = scalar @{ $tbl{"$lang-en"}    || [] };
  my $xxcount = scalar @{ $tbl{"$lang-$lang"} || [] };
  if ($encount != $xxcount) {
    print STDERR "Warning! $enfile and $xxfile are not line-aligned ($encount vs $xxcount lines)\n";
  }
}

# Exactly two arguments are required. Running with no arguments is the
# documented way to ask for help, so that prints the usage text on STDOUT and
# exits 0. Any other wrong argument count is a mistake: the usage text goes to
# STDERR and the exit status is non-zero so calling scripts can detect it.
unless (@ARGV == 2) {
  my $usage = <<"ENDUSAGE";
Usage: perl $0 <sourcedir> <destdir>

To use this program, first download the Project Syndicate collection
from <http://www.statmt.org/wmt10/translation-task.html>; it appears
as "Parallel corpus training data" under the heading "Download."

Uncompress and untar the data; the resulting directory is <sourcedir>.

<destdir> is the directory where you want the documents to be placed;
it must not exist prior to running this script.

This program was tested under BSD Unix on a Mac (Darwin 10.8.0) and
under Gnu/Linux 2.6; it has not been verified on other platforms.

ENDUSAGE
  if (@ARGV == 0) {
    print $usage;
    exit 0;
  }
  print STDERR $usage;
  exit 1;
}

my ($sourcedir, $destdir) = @ARGV;

# Drop any trailing slashes from both paths before they are used to build
# file names.
s/\/+$// for ($sourcedir, $destdir);

# The source must already be a directory; the destination must not exist yet
# and is created here.
-d $sourcedir or die "Directory $sourcedir does not exist";
if (-e $destdir) {
  die "Destination directory $destdir already exists";
}
mkdir($destdir) or die "Could not create directory $destdir: $!";

# Load data
# (the loop variable is lexical on purpose: the loader reads files line by
# line and must not be handed an alias through $_)
for my $code (@langs) {
    loadLangFiles($sourcedir, $code);
}

# Fill title hashtable
#
# Walk the English side of the de-en file and record every line that looks
# like a document title, keyed by its text and valued by its 0-based line
# index. A line that is already in the table is reported as a duplicate and
# set to -1.
my $position = 0;
foreach my $sentence (@{$tbl{"de-en"}}) {
    # Every line consumes one index, whether or not it becomes a title.
    my $index = $position++;

    if (exists $title{$sentence}) {
        print STDERR "Warning! Duplicate title: $sentence\n";
        $title{$sentence} = -1;
        next;
    }

    # Title heuristic: starts with a capital letter, has between one and
    # eight whitespace-separated words, contains a letter, and does not end
    # in sentence punctuation or a double quote.
    next unless $sentence =~ /^[A-Z]/;
    my @tokens = split /\s+/, $sentence;
    next unless @tokens;
    next if @tokens > 8;
    next unless $sentence =~ /[a-zA-Z]/;
    next if $sentence =~ /[\.\?\!:;"]$/;

    $title{$sentence} = $index;
}

# Write one aligned document as a pair of files with the same name:
#   $odir/en/$docstr      - the English lines
#   $odir/$olang/$docstr  - the foreign-language lines
#
# Parameters:
#   $odir   - per-language output directory (must already exist)
#   $olang  - output language code, used as the foreign subdirectory name
#   $docstr - file name of the document
#   $edoc   - array ref of English lines
#   $xdoc   - array ref of foreign-language lines
#
# The two subdirectories are created on first use. Dies if a directory
# cannot be created or a file cannot be opened or closed.
sub _writeAlignedDoc {
    my ($odir, $olang, $docstr, $edoc, $xdoc) = @_;

    # English first, then the foreign language.
    my @outputs = (
        ["$odir/en",     $edoc],
        ["$odir/$olang", $xdoc],
    );

    foreach my $output (@outputs) {
        my ($dir, $lines) = @$output;
        unless (-d $dir) {
            mkdir $dir or die "Could not create directory $dir: $!";
        }
        my $path = "$dir/$docstr";
        open(my $fh, ">:utf8", $path) or die "Could not open $path: $!";
        foreach my $text (@$lines) {
            print $fh "$text\n";
        }
        # Buffered write errors only surface at close time.
        close $fh or die "Could not close $path: $!";
    }
}

# For every language, cut the line-aligned corpus into documents. A document
# starts at each English line that %title maps to a non-negative index and
# runs up to the next such line. It is written under
# <destdir>/<olang>/{en,<olang>}/PS-<7-digit index>, where the index is the
# %title value of the document's first English line.
foreach my $lang (@langs) {
    # Output directories use 'cs' for the corpus code 'cz'.
    my $olang = ($lang eq 'cz') ? 'cs' : $lang;
    my $odir = "$destdir/$olang";
    unless (-d $odir) {
      mkdir $odir or die "Could not create directory $odir: $!";
    }

    # "|| []" covers a language whose input was empty (key never created).
    my $enlines = $tbl{"$lang-en"}    || [];
    my $xxlines = $tbl{"$lang-$lang"} || [];

    # Lines of the document currently being collected.
    my @edoc = ();
    my @xdoc = ();

    # Write out the buffered document (if it starts with a usable title) and
    # clear the buffers. Does nothing when nothing has been buffered.
    my $flush = sub {
        return unless @edoc;
        my $first = $edoc[0];
        # Only a buffer that starts with a known, unambiguous title is
        # written. Anything else (leading lines before the first title, or a
        # duplicate title marked -1) is dropped rather than given a bogus
        # name such as "PS--000001".
        if (exists $title{$first} && $title{$first} >= 0) {
            my $docstr = sprintf("PS-%07d", $title{$first});
            _writeAlignedDoc($odir, $olang, $docstr, \@edoc, \@xdoc);
        }
        @edoc = ();
        @xdoc = ();
    };

    foreach my $lineno (0 .. $#$enlines) {
        my $line = $enlines->[$lineno];

        # A usable title line marks a document boundary: emit what came
        # before it, then start collecting the new document.
        if (exists $title{$line} && $title{$line} >= 0) {
            $flush->();
        }

        push @edoc, $line;
        push @xdoc, $xxlines->[$lineno];
    }

    # The final document has no following title line to trigger a flush, so
    # it has to be written explicitly once the input is exhausted.
    $flush->();
}

1;
