#!/usr/bin/perl  -w

use strict;
use Data::Dumper;
use Carp;
use Time::Local;

scalar @ARGV > 4 or die<<USAGE;
Usage:
perl -w convert.pl <filename.xml> <program> <No.markers> <g/h> <out_filename>
USAGE
my $fname=shift;
my $program = shift;  	#for which program 
my $nmks= shift; 	#specify the number of markers
my $datatype = shift;   #specify the type of data (genotype or haplotype)
my $outfilename = shift; 

##----------------------------------------------------------
##--------------------------choose marker--------------------
my $locid= "locidist$nmks";
open(LOCID,">$locid") or die "Couldn't Open File For Writing: $!\n";
print LOCID "<?xml version=\"1.0\"?>\n";
print LOCID "<data>\n";
print LOCID "<parameters>\n";

my $infilecm="$fname";       #input file
my $refcm = &parsexml($infilecm);
my $totalncm = $refcm->{"data"}->{"note"}->{"total_num_genealogies"};

my $tmp; my $tmp2;
my @mklist; my @mklistsort;
my $seed;
open(RAND, "/dev/random")
       or die "No /dev/random?\n";
read(RAND,$seed,4);
close(RAND);
srand(unpack("L", $seed));

for(my $i=1; $i<=$totalncm; $i++)
{
   my $N="num_$i";
   my $totloci= $refcm->{"data"}->{$N}->{"num_loci"}; 
   &chooseone($totloci, $nmks, $i);
}
print LOCID "</parameters>\n";
print LOCID "</data>\n";
close(LOCID);


##-----------------------------------------------------
##-------------------------convert---------------------
my $starttime=localtime;
my $infile="$fname";	#input file
my $locifile="locidist$nmks";
my $genofile="genopair";


#use function in package to parse xml input file
my $ref = &parsexml($infile);
my $refloci = &parsexml($locifile);
#print Dumper($ref);

my @check; my @width;
my $totaln = $ref->{"data"}->{"note"}->{"total_num_genealogies"};
my @truemutloc;
my @estmutloc;
my @check_cond;
system "rm -f -r $outfilename";
system "mkdir $outfilename";

for(my $i=1; $i<=$totaln; $i++)
{
    my $N="num_$i";  my $bN="Num_$i";
    my $nnormal= $ref->{"data"}->{"parameters"}->{"num_normal"}; 
    my $ndisease= $ref->{"data"}->{"parameters"}->{"num_disease"}; 
    my $grate= $ref->{"data"}->{"parameters"}->{"growth_rate"};	
    my $Ne= $ref->{"data"}->{"parameters"}->{"population_size"};	
    my $mutloc= $ref->{"data"}->{"parameters"}->{"mutation_location"};
    my $intervSize = $ref->{"data"}->{"parameters"}->{"interval_size"};
    my $loci= $refloci->{"data"}->{"parameters"}->{$bN};
    my @nloci= split(/\s/, $loci); 
    my $nmarkers= $nmks;
    my $disfre= $ref->{"data"}->{$N}->{"dis_frequency"};
    my $length= $ref->{"data"}->{$N}->{"interval_size"};

   #################################################################################
   ####################################   DMLE  ####################################
   #################################################################################
   if($program eq 'DMLE')
 {
    my $datafile= "$outfilename/".$outfilename.$i;
    open(OUTFILE, ">$datafile") or die "Couldn't Open File For Writing: $!\n";
    print OUTFILE "Data as genotypes? Yes = 1, No = 0(UNIX)\n";
    if($datatype eq 'g') {print OUTFILE "1\n";}
    else {print OUTFILE "0\n";}
    print OUTFILE "Genetic model: Dominant=0,Recessive=1\n0\n";
    print OUTFILE "Read old file?: (0=no, 1=yes):\n0\n";
    print OUTFILE "Use fixed random seed?:\n0\n";
    print OUTFILE "# chromosomes (N, in disease sample):\n$ndisease\n";
    print OUTFILE "# loci per chromosome (L):\n$nmks\n";
    print OUTFILE "Numbers of haplotypes in the normal(base) pop.:\n";
     	
    if($datatype eq "h")
    { 
        for(my $x=0; $x< $nnormal; $x++)
        {
            print OUTFILE "1 ";
            for(my $z=0; $z<$nmks; $z++)
            {
                my $tmp="n$x";
                my $array= $ref->{"data"}->{$N}->{"sequences_normal"}->{$tmp};
                my @narray = split(/\s/, $array);
                print OUTFILE "$narray[$nloci[$z]] "; 
            }
            print  OUTFILE "\n";
        }
    }
    else
    {
        for(my $x=0; $x< $nnormal; $x++)
        {
            for(my $z=0; $z<$nmks; $z++)
            {
                my $tmp="n$x"; my $tmp2 = $tmp;
                $tmp = $tmp."a"; $tmp2 = $tmp2."b";
                my $array1 = $ref->{"data"}->{$N}->{"sequences_normal"}->{$tmp};
                my $array2 = $ref->{"data"}->{$N}->{"sequences_normal"}->{$tmp2};
                my @narray1 = split(/\s/, $array1);
                my @narray2 = split(/\s/, $array2);
                print OUTFILE "$narray1[$nloci[$z]]/$narray2[$nloci[$z]] "; 
            }
            print  OUTFILE "\n";
        }
    }
   
       
    print OUTFILE "Map distances:\n";   
    my $mapdis= $ref->{"data"}->{$N}->{"phy_dis"};
    my @nmapdis= split(/\s/, $mapdis);  
    my $firstmk = $nmapdis[$nloci[0]]; 
    print OUTFILE "0.0 "; 
    for(my $z=1; $z<$nmks; $z++)
    {
        my $temp = $nmapdis[$nloci[$z]] - $firstmk;
        $temp /=100;  		#transform in Morgans  
	print OUTFILE "$temp ";
    }	
    print OUTFILE "\n";	
    print OUTFILE "Run simulation?:\n0\n";
    my $tmutloc= $mutloc - $firstmk; $tmutloc/=100;
    print OUTFILE "Mutation location:\n$tmutloc\n";
    $truemutloc[$i] = $tmutloc;

    print OUTFILE "Mutation's low and high boundaries\n";
    my $lb = 0-$firstmk;     $lb/=100;
    my $hb = $intervSize - $firstmk;   $hb/=100;
    print OUTFILE "$lb $hb\n";
    print OUTFILE "# simultaneous runs:\n1\n";
    print OUTFILE "Starting value(s) for mutation location for each simultaneous run:\n-99\n";
    print OUTFILE "Population growth rate:\n$grate\n"; 
    my $temp=  $ndisease/(2*$Ne*$disfre);
    print OUTFILE "Proportion of population sampled:\n$temp\n";
    print OUTFILE "Iterate ancestral states,mutation age,mutation location, allele freq. (0=no, 1=yes):\n1 1 1 1\n";
    print OUTFILE "Flip (potentially) all loci? (0=no, 1=yes):\n0\n";
    print OUTFILE "Adjustment level for tree, recdist, ancestral, and internal states,alleles:\n"; 
    print OUTFILE "5.0   0.005   0.5     0.5   0.1\n";
    print OUTFILE "Burn-in iterations:\n1000000\n";
    print OUTFILE "Iterations:\n1000000\n";
    print OUTFILE "Screen and file update intervals, save acceptance rates, save\n100 100  1 1 1\n";
    print OUTFILE "Number of histogram bars:\n200\n";
    print OUTFILE "Alpha level for recdist histogram:\n0.05\n";
    print OUTFILE "Mutation age (-99 for random):\n-99\n";
    print OUTFILE "Mutation age boundaries:\n10\t10000\n";
    print OUTFILE "Star genealogy (0=no, 1=yes):\n0\n";
    print OUTFILE "Loci for the root  (1xL) (-99):\n";
    for(my $x=0; $x< $nmarkers; $x++)
    { print OUTFILE "1 "; }
    print OUTFILE "\nPatient haplo- or genotypes:\n";
    
    if($datatype eq 'h')
    {
        for(my $x=0; $x< $ndisease; $x++)
        {
            print OUTFILE "1 ";
            for(my $z=0; $z<$nmks; $z++)
            {
                my $tmp="n$x";
                my $array= $ref->{data}->{$N}->{sequences_disease}->{$tmp};
                my @narray = split(/\s/, $array);
                print OUTFILE "$narray[$nloci[$z]] ";
            }
            print  OUTFILE "\n";
        }
    }
    else
    {
        for(my $x=0; $x< $ndisease; $x++)
        {
            for(my $z=0; $z<$nmks; $z++)
            {
                my $tmp="n$x"; my $tmp2 = $tmp;
                $tmp = $tmp."a"; $tmp2 = $tmp2."b";
                my $array1 = $ref->{"data"}->{$N}->{"sequences_disease"}->{$tmp};
                my $array2 = $ref->{"data"}->{$N}->{"sequences_disease"}->{$tmp2};
                my @narray1 = split(/\s/, $array1);
                my @narray2 = split(/\s/, $array2);
                print OUTFILE "$narray1[$nloci[$z]]/$narray2[$nloci[$z]] "; 
            }
            print  OUTFILE "\n";
        }
    }

    print OUTFILE "Use sequence weights?\n0\n";
    print OUTFILE "Weights for exons,introns,non-genes\n";
    close(OUTFILE);
  }
  #----------------------------finish one inputfile for DMLE------------------------------
  else {die "Couldn't find the program: $program !\n";}
}
system "rm -f $locid";

#-------------------------sub routines-------------------------------
sub parsexml($)
{
	my $input = shift;
	my $ret = {};
	open IN, "<$input" or warn "Cannot open input file $input!\n";

	my $data = {};

	while (<IN>)
	{
		if (/\<parameters\>/i)
		{
			my $pars = {};
			while (<IN>)
			{
				if (/\<\/Parameters\>/i)
				{	last;		}
				if (/\<(.*)\>(.*)\<\/(.*)\>/)
				{	$pars->{$1} = $2;	}
			}
			$data->{"parameters"} = $pars;
		}
		if (/\<num_(.*)\>/i)
		{
			my $num = {};
			my $name = "num_$1";
			while (<IN>)
			{
				if (/\<\/$name\>/i)
				{	last;		}
				if (/\<(haplotypes|genotypes)_normal\>/i)
				{
					my $seq_normal = {};
					while (<IN>)
					{
						if (/\<\/(haplotypes|genotypes)_normal\>/i)
						{	last;		}
						if (/\<(.*)\>(.*)\<\/(.*)\>/)
						{	$seq_normal->{$1} = $2;	}
					}
					$num->{"sequences_normal"} = $seq_normal;
				}
				if (/\<(haplotypes|genotypes)_disease\>/i)
				{
					my $seq_disease = {};
					while (<IN>)
					{
						if (/\<\/(haplotypes|genotypes)_disease\>/i)
						{	last;		}
						if (/\<(.*)\>(.*)\<\/(.*)\>/)
						{	$seq_disease->{$1} = $2;	}
					}
					$num->{"sequences_disease"} = $seq_disease;
				}
				if (/\<(.*)\>(.*)\<\/(.*)\>/)
				{	$num->{$1} = $2;	}
			}
			$data->{$name} = $num;
		}
		if (/\<note\>/i)
		{
			my $note = {};
			while (<IN>)
			{
				if (/\<\/note\>/i)
				{	last;		}
				if (/\<(.*)\>(.*)\<\/(.*)\>/)
				{	$note->{$1} = $2;	}
			}
			$data->{"note"} = $note;
		}	
	}
	close IN;

	$ret->{"data"} = $data;
	return $ret;
}

##-----------------choosemarker-------------
sub by_number_rev{
   if($a < $b)    {return -1;}
   elsif($a ==$b) {return 0; }
   elsif($a > $b) {return 1;} 
}

sub chooseone{
   my $totloci=shift;   
   my $nloci=shift;    ##how many loci needed to be chosen
   my $num=shift;

   my @array;
   for(my $i=0; $i<$totloci; $i++) {$array[$i]=$i;}
   my @resarray;

   for( my $z=0; $z<$nloci; $z++)
   {
      my $tmp= rand()*$totloci;
      if($tmp =~ /(.*)\.(.*)/) {$tmp=$1;}   ##get the integer part
      $resarray[$z]= $array[$tmp];

      my $p=0; my @backarray;
      for(my $i=0; $i<$totloci; $i++) {$backarray[$i] = $array[$i];} @array=();
      for(my $i=0; $i<$totloci; $i++)
      {
        if($i!=$tmp) { $array[$p]= $backarray[$i]; $p++;}
      }
     $totloci--;
   }
   my @resarraysort= sort by_number_rev @resarray;
   print LOCID "<Num_$num>";
   for( my $z=0; $z<$nloci; $z++) {print LOCID $resarraysort[$z], " ";}
   print LOCID "</Num_$num>\n";
}

