#!/usr/bin/perl

use strict;
use warnings;

use Encode;
use Encode::HanExtra;
use File::Glob qw(bsd_glob);

# For some reason, the following works where open(FH, "<:encoding(big5)", $filename) does not:
#use encoding qw(big5);

# Split SGML read from an already-open filehandle into one file per <DOC>.
#
# Parameters:
#   $infile    - readable filehandle.  No decoding layer is applied here, so
#                any decoding must already be set up on the handle by the
#                caller.
#   $targetdir - existing directory that receives the output files.
#
# Every <DOC docid="...">...</DOC> element (matched case-insensitively) is
# written, with all <...> tags removed, to "$targetdir/<docid>" through a
# :utf8 output layer.  The docid is used verbatim as the file name.  The
# input handle is closed before returning.
#
# Dies if an output file cannot be opened, written, or closed.
sub split_big5_from_fh_sgml_into_files {
  my ($infile, $targetdir) = @_;

  # Slurp mode
  local ($/);
  my $sgml = <$infile>;
  close $infile;

  # A handle that is already at EOF yields undef; treat that as empty input
  # instead of matching against an undefined value.
  $sgml = '' unless defined $sgml;

  while ($sgml =~ /<DOC docid="(.*?)".*?>(.*?)<\/DOC>/sgi) {
    my $docid = $1;
    my $body = $2;
    $body =~ s/<.*?>//g;

    my $path = "$targetdir/$docid";
    open(my $outfile, ">:utf8", $path) or die "Could not open $path: $!";
    print {$outfile} $body or die "Could not write to $path: $!";
    # Buffered write errors (e.g. a full disk) may only surface at close.
    close $outfile or die "Could not close $path: $!";
  }
}

# Split a Big5+ encoded SGML file into one UTF-8 file per <DOC>.
#
# Parameters:
#   $filename  - path of the SGML file; it is read through the
#                :encoding(big5plus) layer.
#   $targetdir - existing directory that receives the output files.
#
# Every <DOC docid="...">...</DOC> element (matched case-insensitively) is
# written, with all <...> tags removed, to "$targetdir/<docid>" through a
# :utf8 output layer.  The docid is used verbatim as the file name.
#
# Dies if the input cannot be opened or an output file cannot be opened,
# written, or closed.
sub split_big5_sgml_into_files {
  my ($filename, $targetdir) = @_;

  # Slurp mode
  local ($/);
  open(my $infile, "<:encoding(big5plus)", $filename) or die "Could not open $filename: $!";
  my $sgml = <$infile>;
  # The whole file is in memory now, so the input handle is closed exactly
  # once, here.
  close $infile;

  # Guard against an undefined read result before matching.
  $sgml = '' unless defined $sgml;

  while ($sgml =~ /<DOC docid="(.*?)".*?>(.*?)<\/DOC>/sgi) {
    my $docid = $1;
    my $body = $2;
    $body =~ s/<.*?>//g;

    my $path = "$targetdir/$docid";
    open(my $outfile, ">:utf8", $path) or die "Could not open $path: $!";
    print {$outfile} $body or die "Could not write to $path: $!";
    # Buffered write errors (e.g. a full disk) may only surface at close.
    close $outfile or die "Could not close $path: $!";
  }
}

# Split an SGML file in a caller-specified encoding into one UTF-8 file per
# <DOC>.
#
# Parameters:
#   $filename  - path of the SGML file.
#   $targetdir - existing directory that receives the output files.
#   $encoding  - encoding name placed in the :encoding(...) input layer
#                (for example "latin1").
#
# Every <DOC docid="...">...</DOC> element (matched case-insensitively) is
# written, with all <...> tags removed, to "$targetdir/<docid>" through a
# :utf8 output layer.  The docid is used verbatim as the file name.
#
# Dies if the input cannot be opened or an output file cannot be opened,
# written, or closed.
sub split_sgml_into_files {
  my ($filename, $targetdir, $encoding) = @_;

  # Slurp mode
  local ($/);
  open(my $infile, "<:encoding($encoding)", $filename) or die "Could not open $filename: $!";
  my $sgml = <$infile>;
  # The whole file is in memory now, so the input handle is closed exactly
  # once, here.
  close $infile;

  # Guard against an undefined read result before matching.
  $sgml = '' unless defined $sgml;

  while ($sgml =~ /<DOC docid="(.*?)".*?>(.*?)<\/DOC>/sgi) {
    my $docid = $1;
    my $body = $2;
    $body =~ s/<.*?>//g;

    my $path = "$targetdir/$docid";
    open(my $outfile, ">:utf8", $path) or die "Could not open $path: $!";
    print {$outfile} $body or die "Could not write to $path: $!";
    # Buffered write errors (e.g. a full disk) may only surface at close.
    close $outfile or die "Could not close $path: $!";
  }
}

# Exactly two arguments are required.  Anything else is a usage error: the
# help text goes to STDERR and the exit status is non-zero, so that calling
# scripts and shell "&&" chains can tell that nothing was done.
unless (@ARGV == 2) {
  print STDERR <<"ENDUSAGE";
Usage: perl $0 <sourcedir> <destdir>

To use this program, first obtain the Chinese collection from LDC
<http://www.ldc.upenn.edu/>; it is Catalog number LDC2005T10.

Uncompress and untar the data; the resulting directory is <sourcedir>.

<destdir> is the directory where you want the documents to be placed;
it must not exist prior to running this script.

This program was tested under BSD Unix on a Mac (Darwin 10.8.0) and
under Gnu/Linux 2.6; it has not been verified on other platforms.

ENDUSAGE
  exit 1;
}

# Main driver: validate the two directories, create the output tree
# (<destdir>/en and <destdir>/zh), then split every English and Chinese
# .sgm file found under <sourcedir>/data.
my ($sourcedir, $destdir) = @ARGV;

# Strip trailing slashes, but leave a path that consists only of slashes
# (such as "/") intact instead of reducing it to the empty string.
$sourcedir =~ s{(?<=[^/])/+\z}{};
die "Directory $sourcedir does not exist" unless -d $sourcedir;
$destdir =~ s{(?<=[^/])/+\z}{};
die "Destination directory $destdir already exists" if -e $destdir;
mkdir($destdir) or die "Could not create directory $destdir: $!";
mkdir("$destdir/en") or die "Could not create directory $destdir/en: $!";
mkdir("$destdir/zh") or die "Could not create directory $destdir/zh: $!";

# bsd_glob is used instead of the <...> glob operator because the operator
# splits its pattern on whitespace, which breaks source paths that contain
# spaces.
my @efiles = bsd_glob("$sourcedir/data/English/*.sgm");
warn "No .sgm files found in $sourcedir/data/English\n" unless @efiles;
foreach my $efile (@efiles) {
  split_sgml_into_files($efile, "$destdir/en", "latin1");
}

my @zfiles = bsd_glob("$sourcedir/data/Chinese/*.sgm");
warn "No .sgm files found in $sourcedir/data/Chinese\n" unless @zfiles;
foreach my $zfile (@zfiles) {
  split_big5_sgml_into_files($zfile, "$destdir/zh");
}

#&split_big5_from_fh_sgml_into_files(*STDIN, "$destdir/zh");

# big5-eten "\x8E" does not map to Unicode at BuildLDCChineseParallelCorpus.pl line 15.
# Done file /export/common/data/corpora/LDC/LDC2005T10/data/Chinese/SINO197902.sgm

1;


# ## -----------------------------------------------------------------
# ## LDC2005T10 (English / Chinese)

# ## (1) Create Raw Data
# cp data from /export/common/data/corpora/LDC/LDC2005T10 to /export/projects/mlentlnk/langs/zh/raw

# # English half
# foreach file (*.sgm)
#  perl -Mencoding=latin1 -e '$text = ""; while (<>) {$text .= $_;} END {@m = $text =~ /<DOC docid.*?>.*?<\/DOC>/msgi; foreach my $doc (@m) {$doc =~ m/docid="(.*?)"/i; $docid=$1; $doc =~ s/<.*?>//gi; open FH, ">$docid"; print FH $doc; close FH;} }' $file
# end

# # Chinese half
# foreach file (*.sgm)
#  perl -Mencoding=big5 -e '$text = ""; while (<>) {$text .= $_;} END {@m = $text =~ /<DOC docid.*?>.*?<\/DOC>/msgi; foreach my $doc (@m) {$doc =~ m/docid="(.*?)"/i; $docid=$1; $doc =~ s/<.*?>//gi; open (FH, ">:utf8", $docid); print FH $doc; close FH;} }' $file
# end
