#!/usr/bin/perl -w
#
# checkFormat_scoreMatrix.pl 
# 
# Verifies the proper format of the score Matrix.
#
# Created Mon. 28 Aug, 2006 by Lourdes Pena Castillo (lourdes dot pena at gmail.com)
# $Id: checkFormat_scoreMatrix.pl,v 1.1 2006/08/28 14:00:33 lourdes Exp lourdes $
# Copyright (C) 2006  Lourdes Pena Castillo
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# http://www.gnu.org/copyleft/gpl.html
#################################################################################
#################################################################################
use English;
use strict;

# Script body: validate the layout of a tab-separated score matrix file.
#
# Layout enforced by the checks below:
#   * line 1 (header): a leading row-label cell, followed by one cell per
#     column, each containing a GO ID ("GO:" followed by 7 digits);
#   * every other line: a cell containing a gene ID ("G" followed by 5
#     digits), followed by exactly as many scores as the header has GO ID
#     columns; each score is an unsigned decimal number (optionally with a
#     signed exponent) whose value lies within [0, 1].
#
# Every problem found is printed on STDOUT as "Error<N>: ...", followed by a
# summary line and the elapsed time.
#
# Dies when the file name argument is missing, when the file cannot be
# opened, or when the first header cell itself contains a GO ID (taken to
# mean that the leading tab is missing).
my $start = time();

my ($scoreMatrix_file) = @ARGV;    # Read arguments

# Check values of arguments
if (!defined($scoreMatrix_file)) {
    die("Insufficient arguments. Usage: checkFormat_scoreMatrix.pl  scoreMatrix_file\n");
}

# Three-argument open with an explicit read mode: the file name comes from
# the command line, and with two-argument open a name such as ">foo" or
# "cmd |" would be interpreted as a mode instead of a plain file name.
open(my $matrix_fh, '<', $scoreMatrix_file)
    or die("open($scoreMatrix_file): $!\n");

my $sep       = "\t";
my $firstline = 1;
my $nc        = 0;    # number of GO ID columns announced by the header
my $errors    = 0;    # running count of reported problems
my $nl        = 0;    # current line number (1-based)

# Patterns are compiled once instead of on every cell.  The ID patterns are
# deliberately unanchored: a cell is accepted when it *contains* an ID.
my $go_id_re   = qr/GO:\d{7}/;
my $gene_id_re = qr/G\d{5}/;
my $score_re   = qr/^\s*\d+\.?\d*(e[-\+]\d+)?\s*$/i;

while (my $line = <$matrix_fh>) {
    chomp($line);
    $nl++;

    # LIMIT -1 keeps trailing empty fields; without it split silently drops
    # them, so stray trailing tabs or empty last cells would go unnoticed.
    my ($gene, @fields) = split($sep, $line, -1);

    # split returns an empty list for an empty line, leaving $gene undefined.
    $gene = '' unless defined $gene;

    if ($firstline) {    # First line contains column labels
        $nc = scalar @fields;
        die "In first line, missing tab before GO_IDs\n" if $gene =~ $go_id_re;

        # Check whether each column label contains a GO ID
        foreach my $label (@fields) {
            next if $label =~ $go_id_re;
            print "Error", ++$errors, ": $label contains no valid GO ID\n";
        }
        $firstline = 0;
        next;
    }

    # Check lines with scores
    if ($gene !~ $gene_id_re) {
        print "Error", ++$errors, ": $gene contains no valid Gene ID\n";
    }
    if (scalar @fields != $nc) {
        print "Error", ++$errors, ": line $nl doesn't contain $nc scores\n";
    }
    foreach my $score (@fields) {    # check each score
        # The pattern test comes first so the numeric comparisons only ever
        # see strings that already look like numbers.
        next if $score =~ $score_re && $score >= 0 && $score <= 1;
        print "Error", ++$errors,
            ": score $score in line $nl has incorrect format or is outside range [0-1]\n";
    }
}

close($matrix_fh);

# A file without any line has no header and therefore cannot be well formed.
if ($nl == 0) {
    print "Error", ++$errors, ": file $scoreMatrix_file is empty\n";
}

if ($errors) {
    print "Found $errors formatting errors in file $scoreMatrix_file \n";
}
else {
    print "The file $scoreMatrix_file has the correct format!\n";
}

printf "Elapsed time: %d sec.\n", time() - $start;

# Plain exit: the exit status does not depend on the number of errors found.
exit;

#################################################################################
