#!/usr/bin/perl

use Getopt::Long;
use Term::ANSIColor qw(:constants);

# Fraction of the input words that goes into the first part (G1);
# the remaining 1-$G1 of the words form the second part (G2).
my $G1=0.909091;
my $ofile="syl_output";   # output base name used when reading standard input
my $ifile;                # input file name, "-" means standard input
my @stack;
my (@g1sl, @g2sl);        # word lists of G1 and G2
my (%g1_t, %g1_b, %g1_m,%g2_t, %g2_b, %g2_m);   # tri-/bi-/monogram counts
my (%g1_tp, %g1_bp, %g1_mp);                    # G1 probabilities
local *INPUT,*OUTPUT;     # bareword handles; OUTPUT is the statistics file
my $prior=1;              # debug level (--dl), compared in mess()
my $color=0;              # 1 when --color was given
my $help=0;               # 1 when --help was given
my %optctl = (help => \$help, dl => \$prior, color => \$color);

# Stop on malformed options instead of silently running with defaults.
GetOptions(\%optctl,"color","dl=i","help") or
    die ("Invalid command line options, try --help\n");

printhelp() if($help);
process_arguments();

# Three-argument open: the file name is never interpreted as a mode.
open (OUTPUT,">",$ofile) or
    die ("Can not create output file $ofile: $! !!!\n");
mess("Processing ...",1);
read_data();
make_3g_2g_1g(\@g1sl, \%g1_t, \%g1_b, \%g1_m);
make_3g_2g_1g(\@g2sl, \%g2_t, \%g2_b, \%g2_m);
compute_mon_p(\%g1_m, \%g1_mp);
compute_big_p(\%g1_b, \%g1_bp);
compute_tri_p(\%g1_t, \%g1_tp);
print_stat_file();
# Buffered write errors (e.g. disk full) only surface at close time.
close(OUTPUT) or
    die ("Can not finish writing output file $ofile: $! !!!\n");
warn "\n(c) Dodo 2003,2004,2005\n\n";
exit(0);

# Write all six statistics sections to OUTPUT.
# Every section is a banner line followed by the table printed by
# print_h(); the G2 sections are printed from their count tables only.
sub print_stat_file
{
  # banner label, count table, and (for G1 only) probability table
  my @sections = (
    [ "MONO G1", \%g1_m, \%g1_mp ],
    [ "BI G1",   \%g1_b, \%g1_bp ],
    [ "TRI G1",  \%g1_t, \%g1_tp ],
    [ "MONO G2", \%g2_m ],
    [ "BI G2",   \%g2_b ],
    [ "TRI G2",  \%g2_t ],
  );
  foreach my $section (@sections)
  {
    my ($label, @tables) = @$section;
    print OUTPUT "***** $label *****\n";
    print_h(@tables);
  }
}

# Turn trigram counts into relative frequencies within each two-syllable
# prefix.
#
# Arguments:
#   $cnt - hash ref: "s1-s2-s3" trigram key => occurrence count
#   $prb - hash ref filled in place: trigram key => its count divided by
#          the summed count of all trigrams sharing the same "s1-s2" prefix
# The total number of trigram occurrences is reported through mess().
# A prefix whose counts sum to zero yields probability 0 instead of a
# division-by-zero error.
sub compute_tri_p
{
 my ($cnt,$prb)=@_;
 my $sylocc=0;
 $sylocc+=$_ foreach (values %$cnt);
 mess("Num. of all trigrams in G1: $sylocc",1);

 # Split every key once: remember its prefix and accumulate prefix totals.
 my (%prefix_of,%fp);
 foreach my $tri (keys %$cnt)
  {
   my ($s1,$s2)=split("-",$tri);
   my $prefix="$s1-$s2";
   $prefix_of{$tri}=$prefix;
   $fp{$prefix}+=$$cnt{$tri};
  }
 foreach my $tri (keys %$cnt)
  {
   my $total=$fp{$prefix_of{$tri}};
   $$prb{$tri}=$total ? $$cnt{$tri}/$total : 0;
  }
}

# Turn bigram counts into relative frequencies within each first syllable.
#
# Arguments:
#   $cnt - hash ref: "s1-s2" bigram key => occurrence count
#   $prb - hash ref filled in place: bigram key => its count divided by
#          the summed count of all bigrams sharing the same first syllable
# The total number of bigram occurrences is reported through mess().
# A first syllable whose counts sum to zero yields probability 0 instead
# of a division-by-zero error.
sub compute_big_p
{
 my ($cnt,$prb)=@_;
 my $sylocc=0;
 $sylocc+=$_ foreach (values %$cnt);
 mess("Num. of all bigrams in G1: $sylocc",1);

 # Split every key once: remember its first syllable and accumulate totals.
 my (%first_of,%fp);
 foreach my $big (keys %$cnt)
  {
   my ($first)=split("-",$big);
   $first="" unless defined $first;
   $first_of{$big}=$first;
   $fp{$first}+=$$cnt{$big};
  }
 foreach my $big (keys %$cnt)
  {
   my $total=$fp{$first_of{$big}};
   $$prb{$big}=$total ? $$cnt{$big}/$total : 0;
  }
}

# Turn monogram counts into relative frequencies.
#
# Arguments:
#   $cnt - hash ref: syllable => occurrence count
#   $prb - hash ref filled in place: syllable => count / total count
# The total count and the sum of all computed probabilities (a sanity
# check) are reported through mess(). When the total count is zero every
# probability is 0 instead of raising a division-by-zero error.
sub compute_mon_p
{
 my ($cnt,$prb)=@_;
 my $sylocc=0;
 $sylocc+=$_ foreach (values %$cnt);
 mess("\nNum. of all monograms in G1: $sylocc",1);
 my $cp=0;
 foreach my $syl (keys %$cnt)
  {
   my $sprb=$sylocc ? $$cnt{$syl}/$sylocc : 0;
   $$prb{$syl}=$sprb;
   $cp+=$sprb;
  }
 mess("Check prob. for monograms: $cp",1);
}

# Count syllable trigrams, bigrams and monograms over a list of words.
#
# Arguments:
#   $all - array ref of words whose syllables are separated by "-"
#   $tr  - hash ref, incremented in place: "s1-s2-s3" => count
#   $bi  - hash ref, incremented in place: "s1-s2"    => count
#   $mo  - hash ref, incremented in place: syllable   => count
# N-grams never span two words. Progress and the number of distinct keys
# in each table are reported through mess().
sub make_3g_2g_1g
{
 my ($all,$tr,$bi,$mo)=@_;
 my $num=@$all;
 mess("Creating T,B,M for $num words ...",1);
 foreach my $word (@$all)
 {
  my @syl=split("-",$word);
  my $snum=@syl;
  mess("Processing $word  \t syls: $snum",2);
  # Monograms
  $$mo{$_}++ foreach (@syl);
  # Bigrams: every pair of adjacent syllables (none for words shorter than 2)
  foreach my $i (0 .. $snum-2)
  {
   $$bi{join("-",@syl[$i,$i+1])}++;
  }
  # Trigrams: every run of three adjacent syllables
  foreach my $i (0 .. $snum-3)
  {
   $$tr{join("-",@syl[$i .. $i+2])}++;
  }
 }
 my $monum=keys(%$mo);
 my $binum=keys(%$bi);
 my $trnum=keys(%$tr);
 mess("Stats - MO: $monum\t BI: $binum\t TR: $trnum",1);
}

# Print one statistics table to OUTPUT, one "key::count::probability"
# line per key, in sorted key order.
#
# Arguments:
#   $_[0] - hash ref: key => count
#   $_[1] - hash ref: key => probability; may be omitted, in which case
#           the third field of every line is left empty
sub print_h
{
 my ($counts,$probs)=@_;
 my @ordered=sort keys %$counts;
 foreach my $key (@ordered)
 {
  print OUTPUT join("::",$key,$counts->{$key},$probs->{$key})."\n";
 }
}

# Read the word list named by $ifile and split it into @g1sl and @g2sl.
#
# Every input line is one word; the line end and all spaces are removed.
# The first int($G1 * number of words) words go to @g1sl, the rest to
# @g2sl. An $ifile of "-" means standard input. Dies when the input file
# cannot be opened. Word counts are reported through mess().
sub read_data
{
 my $in;
 my $opened=0;
 if(defined($ifile) and $ifile eq "-")
  {
   $in=\*STDIN;
  }
 else
  {
   # Three-argument open: the file name is never interpreted as a mode.
   open ($in,"<",$ifile) or
      die ("Can not open input file $ifile: $! !!!\n");
   $opened=1;
  }
 my @all;
 while(my $line=<$in>)
 {
  chomp $line;
  $line =~ s/ //g;
  push @all,$line;
 }
 close($in) if($opened);
 my $num=@all;
 mess("Words count: \t$num",1);
 my $num1=int($G1*$num);
 # Slices replace the element-by-element copy loops; an empty range
 # (no words for a part) simply yields an empty list.
 @g1sl=@all[0 .. $num1-1];
 @g2sl=@all[$num1 .. $#all];
 $num1=@g1sl;
 my $num2=@g2sl;
 mess("Words count G1: \t$num1",1);
 mess("Words count G2: \t$num2",1);
}

# Derive the input and output file names from the command line.
#
# Without arguments the input is standard input ($ifile is "-") and the
# output base name keeps its current value. Otherwise the first argument
# is the input file and the output base name is that path with the
# extension of its last component removed. ".statistics" is appended to
# the base name in both cases. Both names are reported through mess().
sub process_arguments
  {
    if(@ARGV==0)
      {
        $ifile="-";
        mess("Reading from standard input ...",1);
      }
    else
      {
        $ifile=$ARGV[0];
        # Strip only an extension of the file name itself: the dot must
        # follow a non-separator character and nothing after it may be a
        # dot or a path separator. Otherwise "../data/words" would turn
        # into "." and "dir.v2/words" into "dir".
        ($ofile = $ifile) =~ s{(?<=[^/\\])\.[^./\\]+\z}{};
      }
    $ofile .= ".statistics";

    mess("Input:\t $ifile",1);
    mess("Output:\t $ofile",1);
  }

# Print a progress/debug message to standard output.
#
# Arguments:
#   $msg - message text; a space and a newline are appended
#   $pr  - level of the message; it is printed only when $pr does not
#          exceed the debug level $prior
# Messages are indented by one tab per level above 1. With $color set the
# message is coloured by level: 1 bold yellow, 2 yellow, 3 green, 4 blue,
# anything else the terminal default (previously such levels other than 5
# were silently dropped in colour mode).
sub mess
  {
    my($msg,$pr)=@_;
    return if($pr > $prior);

    my $indent=($pr > 1) ? "\t" x ($pr-1) : "";

    if(!$color)
      {
        print "$indent$msg \n";
      }
    elsif($pr==1)
      {
        print YELLOW,BOLD,"$msg \n",RESET;
      }
    elsif($pr==2)
      {
        print YELLOW,"$indent$msg \n",RESET;
      }
    elsif($pr==3)
      {
        print GREEN,"$indent$msg \n",RESET;
      }
    elsif($pr==4)
      {
        print BLUE,"$indent$msg \n",RESET;
      }
    else
      {
        print RESET,"$indent$msg \n";
      }
  }

# Print the usage text to standard output and exit with status 0.
# The text no longer starts with a stray backslash line and now lists
# the --color option accepted by the option parser.
sub printhelp
{
  print "sylseg_sk_training [--dl debug level] [--color] [--help] [<input file>]
\t<input file>\t- list of the segmented words for the training.
\t\t\t  Standard input is read when no input file is given.
\t--dl\t- Set the debug level. Control the amount of displayed information
\t\t\t  The debug level 0 displays nothing. The maximum level 5 displays full
\t\t\t  debugging report. The default debug level is 1.
\t--color\t- colorize the displayed messages by their debug level
\t--help\t- display a short help text and exit\n";
exit(0);
}



