#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>

/* OVERVIEW:
 * Phylogeny reads a file containing a list of species names and genomes.
 * It then arranges the species into a phylogeny tree that minimizes
 * the number of mutations required to produce the tree.
 */

#include "Species.h"
#include "SpeciesSet.h"
#include "SpeciesTree.h"

static unsigned mutationDistance(const char *left,const char *right) {
/* EFFECTS: Returns the number of point mutations required to turn 
 *	    the left genome into the right genome.
 *          Each mutation changes the value of one base.  
 *          (We are not allowing insertions, deletions, etc. for now.)
 */
    unsigned mutations;
    unsigned size;
    unsigned i;

    if (strlen(left) > strlen(right)) {
	mutations = (unsigned) (strlen(left)-strlen(right));
	size = (unsigned) strlen(right);
    } else {
	mutations = (unsigned) (strlen(right)-strlen(left));
	size = (unsigned) strlen(left);
    }

    for (i=0; i < size; i++)
        if (left[i] != right[i])
            mutations++;

    return (unsigned)mutations;
}

static unsigned parsimonyValue (SpeciesTree tree) {
    /* EFFECTS: Returns the parsimony value of tree. */

  unsigned score = 0;
  SpeciesTree left = tree->left;
  SpeciesTree right = tree->right;

  if (left != NULL) {
    score = score + mutationDistance (tree->root->genome,
				      left->root->genome) 
      + parsimonyValue (left);
  }

  if (right != NULL) {
    score = score + mutationDistance (tree->root->genome,
				      right->root->genome) 
      + parsimonyValue (right);
  }

  return score;
}

/* Prototype for chooseRoot, defined below.
 * This is needed because search_rooted_trees and chooseRoot
 * are mutually recursive.
 * Note: if you need to add an annotation to chooseRoot, you must
 * change its definition in both places.
 */

static SpeciesTree chooseRoot (SpeciesSet set, Species grandparent);

static SpeciesTree findBestTreeRoot (SpeciesSet children,
				 /*@dependent@*/ Species root) {
/* REQUIRES: root != NULL
 * EFFECTS:  Returns the best-scoring tree with the specified <root>
 *           and <children>.
 */
    unsigned best_score = (unsigned)-1;
    SpeciesTree best_tree = NULL;
    unsigned size = SpeciesSet_size(children);
    unsigned n_splits = (1 << size);
    unsigned i;

    /* Iterate over each of the n_splits ways of splitting the children
     * into two sets.
     */
    for (i=0; i < n_splits; i++) {
        unsigned score;
        SpeciesSet left = SpeciesSet_new ();
        SpeciesSet right = SpeciesSet_new ();
        SpeciesTree tree = SpeciesTree_new (root);

	/* Split the children into two sets */
        SpeciesSet_split(children,i,left,right);

	/* Arrange each subset into a subtree */
        SpeciesTree_setLeft (tree, chooseRoot (left, root));
        SpeciesTree_setRight (tree, chooseRoot (right, root));

        score = parsimonyValue (tree);
        if (score < best_score) {
	  best_tree = tree;
	  best_score = score;
        }
    }

    if (best_tree != NULL) {
        return best_tree;
    } else {
        printf("BUG: best_tree is NULL\n");
        exit(EXIT_FAILURE);
    }
}

SpeciesTree chooseRoot (SpeciesSet set, Species grandparent) {
  /* REQUIRES: grandparent != NULL
  ** EFFECTS:  Determines which Species in <set> would make the best root
  **	     under <grandparent>, and returns a tree with that root and all
  **	     other elements of <set> as its children.
  */
  
  unsigned best_score = (unsigned)-1;
  SpeciesTree best_tree = NULL;
  
  if (SpeciesSet_isEmpty (set)) {
    return best_tree;
  }
  
  /* Iterate over each potential root of the tree */
  SpeciesSet_forall(set,root) {
    unsigned score;
    SpeciesTree tree;
    SpeciesSet children;
    
    /* Create a copy of the set of nodes, and remove the root */
    children = SpeciesSet_copy (set);
    SpeciesSet_remove (children, root);
    
    /* Find the best subtree with that root */
    tree = findBestTreeRoot (children, root);
    
    /* Compute the score of this subtree, and the mutationDistance
     * between this subtree's root and the node it branches off of 
     */
    score = parsimonyValue (tree)+
      mutationDistance (root->genome, grandparent->genome);
    
    if (score < best_score) {
      best_score = score;
      best_tree = tree;
    }
  } end_SpeciesSet_forall;
  
  return best_tree;
}

# define MAX_LINE_LENGTH 1024

int main(int argc, char *argv[]) {
  if (argc != 2) {
    printf("Usage: phylogeny <file>\n");
    exit (EXIT_FAILURE);
  } else {
    /* Read species names and genomes from file */
    FILE *f;
    char *name;
    char *genome;
    char line[MAX_LINE_LENGTH];
    SpeciesSet set = SpeciesSet_new();
    Species root;
    bool gotroot = FALSE;
    
    f = fopen(argv[1], "rt");
        
    while (fgets (line, MAX_LINE_LENGTH, f)) {
      char *genomestart;
      
      int index = 0;
      
      while (line[index] != '\0' && !isspace (line[index])) {
		index++;
      }
      
      if (line[index] == '\0' || index == 0) {
		fprintf (stderr, "Badly formatted line: %s\n", line);
		exit (EXIT_FAILURE);
      }

      name = (char *) malloc (sizeof (*name) * (index + 1));

      if (name == NULL) {
		fprintf (stderr, "Out of memory reading file.\n");
		exit (EXIT_FAILURE);
      }

      strncpy (name, line, (size_t) index);
	  name[index] = '\0';
	  
      if (SpeciesSet_contains (set,name) 
		  || (gotroot 
			  && /*@-usedef@*/  /* Splint reports a possibly undefined use of root here, but we know
			       	               it is defined because of gotroot. */
				 (strcmp (Species_getName (root), name) == 0)
			     /*@=usedef@*/
			  )) {
		fprintf (stderr, "Duplicate species in file: %s\n",name);
		exit (EXIT_FAILURE);
	  } 

      /*
      ** Skip spaces
      */
      
      while (line[index] != '\0' && isspace (line[index])) {
		index++;
      }
      
      genomestart = &line[index];
      index = 0;
      
      while (genomestart[index] != '\0' && !isspace (genomestart[index])) {
		index++;
      }

      if (index == 0) {
		fprintf (stderr, "Badly formatted line (no genome): %s\n", line);
		exit (EXIT_FAILURE);
      }
      
      genome = (char *) malloc (sizeof (*genome) * (index + 1));

      if (genome == NULL) {
		fprintf (stderr, "Out of memory reading genome.\n");
		exit (EXIT_FAILURE);
      }

      strncpy (genome, genomestart, (size_t) index);
	  genome[index] = '\0';
	  
      if (!gotroot) {
		root = Species_new (name, genome);
		gotroot = TRUE;
      } else {
		Species snew = Species_new (name, genome);
		fprintf (stderr, "Species: %s\n", Species_toString (snew));
		Species_markOwned (snew);
		SpeciesSet_insert (set, snew);
      }

		
    }
    
    /* Find best tree with specified root, and print it */
    if (!gotroot) {
      printf("No root species (empty file)\n");
      exit(EXIT_FAILURE);
    } else {
      SpeciesTree tree;
      char *st;
      /* We know root is defined because of gotroot, but Splint can't determine this. */
      tree = findBestTreeRoot (set, /*@-usedef@*/ root /*@=usedef@*/);
      st = SpeciesTree_toString (tree);

      printf ("Best tree: \n%s\n", st);
      free (st);

      printf("Parsimony value: %u\n", parsimonyValue (tree));
    }
  }
  
  return EXIT_SUCCESS;
}