#!/usr/bin/perl -w

#
#  The Intltool Message Merger
#
#  Copyright (C) 2000 Free Software Foundation.
#  Copyright (C) 2000, 2001 Eazel, Inc
#
#  Intltool is free software; you can redistribute it and/or
#  modify it under the terms of the GNU General Public License as
#  published by the Free Software Foundation; either version 2 of the
#  License, or (at your option) any later version.
#
#  Intltool is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#  As a special exception to the GNU General Public License, if you
#  distribute this file as part of a program that contains a
#  configuration script generated by Autoconf, you may include it under
#  the same distribution terms that you use for the rest of that program.
#
#  Authors:  Maciej Stachowiak <mjs@noisehavoc.org>
#            Kenneth Christiansen <kenneth@gnu.org>
#            Darin Adler <darin@bentspoon.com>
#
#  Proper XML UTF-8ification written by Cyrille Chepelov <chepelov@calixo.net>
#

## Release information
my $PROGRAM      = "intltool-merge";
my $PACKAGE      = "intltool";
my $VERSION      = "0.12";

## Script options - Enable by setting value to 1
my $ENABLE_XML   = "1";

## Loaded modules
use strict; 
use File::Basename;
use Getopt::Long;

## Scalars used by the option stuff
my $HELP_ARG = "0";
my $VERSION_ARG = "0";
my $OAF_STYLE_ARG = "0";
my $XML_STYLE_ARG = "0";
my $KEYS_STYLE_ARG = "0";
my $DESKTOP_STYLE_ARG = "0";
my $QUIET_ARG = "0";
my $PASS_THROUGH_ARG = "0";
my $UTF8_ARG = "0";

## Handle options
GetOptions (
	    "help|h" => \$HELP_ARG,
	    "version|v" => \$VERSION_ARG,
            "quiet|q" => \$QUIET_ARG,
	    "oaf-style|o" => \$OAF_STYLE_ARG,
	    "xml-style|x" => \$XML_STYLE_ARG,
	    "keys-style|k" => \$KEYS_STYLE_ARG,
	    "desktop-style|d" => \$DESKTOP_STYLE_ARG,
            "pass-through|p" => \$PASS_THROUGH_ARG,
            "utf8|u" => \$UTF8_ARG
	    ) or &error;


my $PO_DIR;
my $FILE;
my $OUTFILE;

my %po_files_by_lang = ();
my %translations = ();

# Use this instead of \w for XML files to handle more possible characters.
my $w = "[-A-Za-z0-9._:]";

# XML quoted string contents
my $q = "[^\\\"]*";

&split_on_argument;


## Check for options. 
## This section will check for the different options.

sub split_on_argument {

    if ($VERSION_ARG) {
	&version;
    } elsif ($HELP_ARG) {
	&help;
    } elsif ($OAF_STYLE_ARG && @ARGV > 2) {
	&place_normal;
	&message;
	&preparation;
	&oaf_merge_translations;
    } elsif ($XML_STYLE_ARG && @ARGV > 2) {
        &utf8_sanity_check;
	&place_normal;
	&message;
	&preparation;
	&xml_merge_translations;
    } elsif ($KEYS_STYLE_ARG && @ARGV > 2) {
        &utf8_sanity_check;
        &place_normal;
        &message;
        &preparation;
        &keys_merge_translations;
    } elsif ($DESKTOP_STYLE_ARG && @ARGV > 2) {
        &place_normal;
        &message;
        &preparation;
        &desktop_merge_translations;
    } else {
	&help;
    }
}

sub utf8_sanity_check {
    if (!$UTF8_ARG) {
        if (!$PASS_THROUGH_ARG) {
            $PASS_THROUGH_ARG="1";
        }
    }
}

sub place_normal {
    $PO_DIR = $ARGV[0];
    $FILE = $ARGV[1];
    $OUTFILE = $ARGV[2];
}


## Sub for printing release information
sub version{
    print "${PROGRAM} (${PACKAGE}) ${VERSION}\n";
    print "Written by Maciej Stachowiak and Kenneth Christiansen, 2000.\n\n";
    print "Copyright (C) 2000 Free Software Foundation, Inc.\n";
    print "Copyright (C) 2000, 2001 Eazel, Inc.\n";
    print "This is free software; see the source for copying conditions.  There is NO\n";
    print "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n";
    exit;
}

## Sub for printing usage information
sub help{
    print "Usage: ${PROGRAM} [OPTIONS] PO_DIRECTORY FILENAME OUTPUT_FILE\n";
    print "Generates an xml file that includes translated versions of some attributes,\n";
    print "from an untranslated source and a po directory that includes translations.\n";
    print "  -v, --version                shows the version\n";
    print "  -h, --help                   shows this help page\n";
    print "  -q, --quiet                  quiet mode\n";
    print "  -o, --oaf-style              includes translations in the oaf style\n";
    print "  -x, --xml-style              includes translations in the xml style\n";
    print "  -k, --keys-style             includes translations in the keys style\n";
    print "  -d, --desktop-style          includes translations in the desktop style\n";
    print "  -u, --utf8                   convert all strings to UTF-8 before merging\n";
    print "  -p, --pass-through           use strings as found in .po files, without\n";
    print "                               conversion (STRONGLY unrecommended with -x)\n";
    print "\nReport bugs to bugzilla.gnome.org, module intltool or xml-i18n-tools-list\@gnome.org>\n";
    exit;
}


## Sub for printing error messages
sub error{
    print "Try `${PROGRAM} --help' for more information.\n";
    exit;
}


sub message {
    print "Merging translations into $OUTFILE.\n" unless $QUIET_ARG;
}


sub preparation {
   &gather_po_files;
   &create_translation_database;
}

# General-purpose code for looking up translations in .po files

sub gather_po_files
{
    my @po_files = glob("${PO_DIR}/*.po");
    my @languages = map &po_file2lang, @po_files;
    for my $lang (@languages) {
	$po_files_by_lang{$lang} = shift (@po_files);
    }
}

sub po_file2lang
{
    my $tmp = $_; 
    $tmp =~ s/^.*\/(.*)\.po$/$1/; 
    return $tmp; 
}

sub get_po_encoding
{
    my ($in_po_file) = @_;
    my $encoding = "";

    open IN_PO_FILE, $in_po_file or die;
    while (<IN_PO_FILE>) {
        ## example: "Content-Type: text/plain; charset=ISO-8859-1\n"
        if (/Content-Type\:.*charset=([-a-zA-Z0-9]+)\\n/) {
            $encoding = $1; 
            last;
        }
    }
    close IN_PO_FILE;

    if (!$encoding) {
        print "Warning: no encoding found in $in_po_file. Assuming ISO-8859-1\n";
        $encoding = "ISO-8859-1";
    }
    return $encoding
}

sub create_translation_database
{
    for my $lang (keys %po_files_by_lang) {
    	my $po_file = $po_files_by_lang{$lang};

        if ($UTF8_ARG) {
            my $encoding = get_po_encoding($po_file);
            open PO_FILE, "iconv -f $encoding -t UTF-8 $po_file|";	
        } else {
            open PO_FILE, "<$po_file";	
        }

	my $inmsgid = 0;
	my $inmsgstr = 0;
	my $msgid = "";
	my $msgstr = "";
        while (<PO_FILE>) {
            if (/^msgid "((\\.|[^\\])*)"/ ) {
		$translations{$lang}{$msgid} = $msgstr if $inmsgstr && $msgstr && $msgstr ne $msgid;
		$msgid = unescape_po_string($1);
		$inmsgid = 1;
		$inmsgstr = 0;
	    }
	    if (/^msgstr "((\\.|[^\\])*)"/) {
	        $msgstr = unescape_po_string($1);
		$inmsgstr = 1;
		$inmsgid = 0;
	    }
	    if (/^"((\\.|[^\\])*)"/) {
	        $msgid .= unescape_po_string($1) if $inmsgid;
	        $msgstr .= unescape_po_string($1) if $inmsgstr;
	    }
	}
	$translations{$lang}{$msgid} = $msgstr if $inmsgstr && $msgstr && $msgstr ne $msgid;
    }
}

sub unescape_one_sequence
{
    my ($sequence) = @_;

    return "\\" if $sequence eq "\\\\";
    return "\"" if $sequence eq "\\\"";

    # gettext also handles \n, \t, \b, \r, \f, \v, \a, \xxx (octal),
    # \xXX (hex) and has a comment saying they want to handle \u and \U.

    return $sequence;
}

sub unescape_po_string
{
    my ($string) = @_;

    $string =~ s/(\\.)/unescape_one_sequence($1)/eg;

    return $string;
}

sub entity_decode
{
    local ($_) = @_;

    s/&apos;/'/g; # '
    s/&gt;/>/g;
    s/&lt;/</g;
    s/&quot;/"/g; # "
    s/&amp;/&/g;

    return $_;
}

sub entity_encode
{
    my ($pre_encoded) = @_;

    my @list_of_chars = unpack ('C*', $pre_encoded);

    if ($PASS_THROUGH_ARG) {
        return join ('', map (&entity_encode_int_even_high_bit, @list_of_chars));
    } else {
        return join ('', map (&entity_encode_int_minimalist, @list_of_chars));
    }
}

sub entity_encode_int_minimalist
{
    return "&quot;" if $_ == 34;
    return "&amp;" if $_ == 38;
    return "&apos;" if $_ == 39;
    return "&lt;" if $_ == 60;
    return "&gt;" if $_ == 62;
    return chr $_;
}

sub entity_encode_int_even_high_bit
{
    if ($_ > 127 || $_ == 34 || $_ == 38 || $_ == 39 || $_ == 60 || $_ == 62) {
        # the ($_ > 127) should probably be removed
	return "&#" . $_ . ";"; 
    } else {
	return chr $_;
    }
}

sub entity_encoded_translation
{
    my ($lang, $string) = @_;

    my $translation = $translations{$lang}{$string};
    return $string if !$translation;
    return entity_encode($translation);
}

## XML/OAF-specific merge code

sub oaf_merge_translations
{
    my $source;

    {
       local $/; # slurp mode
       open INPUT, "<$FILE" or die "can't open $FILE: $!";
       $source = <INPUT>;
       close INPUT;
    }

    open OUTPUT, ">$OUTFILE" or die "can't open $OUTFILE: $!";

    while ($source =~ s|^(.*?)([ \t]*<\s*$w+\s+($w+\s*=\s*"$q"\s*)+/?>)([ \t]*\n)?||s) {
        print OUTPUT $1;

        my $node = $2 . "\n";

        my @strings = ();
        $_ = $node;
	while (s/(\s)_($w+\s*=\s*"($q)")/$1$2/s) {
             push @strings, entity_decode($3);
        }
	print OUTPUT;

	my %langs;
	for my $string (@strings) {
	    for my $lang (keys %translations) {
                $langs{$lang} = 1 if $translations{$lang}{$string};
	    }
	}
	
	for my $lang (sort keys %langs) {
	    $_ = $node;
	    s/(\sname\s*=\s*)"($q)"/$1"$2-$lang"/s;
	    s/(\s)_($w+\s*=\s*")($q)"/$1 . $2 . entity_encoded_translation($lang, $3) . '"'/seg;
	    print OUTPUT;
        }
    }

    print OUTPUT $source;

    close OUTPUT;
}


## XML (non-OAF) merge code

sub xml_merge_translations
{
    my $source;

    {
       local $/; # slurp mode
       open INPUT, "<$FILE" or die "can't open $FILE: $!";
       $source = <INPUT>;
       close INPUT;
    }

    open OUTPUT, ">$OUTFILE" or die;

    # FIXME: support attribute translations

    # Empty nodes never need translation, so unmark all of them.
    # For example, <_foo/> is just replaced by <foo/>.
    $source =~ s|<\s*_($w+)\s*/>|<$1/>|g;

    # Support for <_foo>blah</_foo> style translations.
    while ($source =~ s|^(.*?)([ \t]*)<\s*_($w+)\s*>(.*?)<\s*/_\3\s*>([ \t]*\n)?||s) {
        print OUTPUT $1;

        my $spaces = $2;
        my $tag = $3;
        my $string = $4;

	print OUTPUT "$spaces<$tag>$string</$tag>\n";

	$string =~ s/\s+/ /g;
	$string =~ s/^ //;
	$string =~ s/ $//;
	$string = entity_decode($string);

	for my $lang (sort keys %translations) {
	    my $translation = $translations{$lang}{$string};
	    next if !$translation;
	    $translation = entity_encode($translation);
	    print OUTPUT "$spaces<$tag xml:lang=\"$lang\">$translation</$tag>\n";
        }
    }

    print OUTPUT $source;

    close OUTPUT;
}

sub keys_merge_translations
{
    open INPUT, "<${FILE}" or die;
    open OUTPUT, ">${OUTFILE}" or die;

    while (<INPUT>) {
        if (s/^(\s*)_(\w+=(.*))/$1$2/)  {
	    my $string = $3;

            print OUTPUT;

	    my $non_translated_line = $_;

            for my $lang (sort keys %translations) {
                my $translation = $translations{$lang}{$string};
                next if !$translation;

                $_ = $non_translated_line;
		s/(\w+)=.*/[$lang]$1=$translation/;
                print OUTPUT;
            }
	} else {
            print OUTPUT;
        }
    }

    close OUTPUT;
    close INPUT;
}

sub desktop_merge_translations
{
    open INPUT, "<${FILE}" or die;
    open OUTPUT, ">${OUTFILE}" or die;

    while (<INPUT>) {
        if (s/^(\s*)_(\w+=(.*))/$1$2/)  {
	    my $string = $3;

            print OUTPUT;

	    my $non_translated_line = $_;

            for my $lang (sort keys %translations) {
                my $translation = $translations{$lang}{$string};
                next if !$translation;

                $_ = $non_translated_line;
                s/(\w+)=$string/${1}[$lang]=$translation/;
                print OUTPUT;
            }
	} else {
            print OUTPUT;
        }
    }

    close OUTPUT;
    close INPUT;
}
