#include <stdio.h> #include <string.h> #include <stdlib.h> #include <ctype.h> /* OVERVIEW: * Phylogeny reads a file containing a list of species names and genomes. * It then arranges the species into a phylogeny tree that minimizes * the number of mutations required to produce the tree. */ #include "Species.h" #include "SpeciesSet.h" #include "SpeciesTree.h" static unsigned mutationDistance(const char *left,const char *right) { /* EFFECTS: Returns the number of point mutations required to turn * the left genome into the right genome. * Each mutation changes the value of one base. * (We are not allowing insertions, deletions, etc. for now.) */ unsigned mutations; unsigned size; unsigned i; if (strlen(left) > strlen(right)) { mutations = (unsigned) (strlen(left)-strlen(right)); size = (unsigned) strlen(right); } else { mutations = (unsigned) (strlen(right)-strlen(left)); size = (unsigned) strlen(left); } for (i=0; i < size; i++) if (left[i] != right[i]) mutations++; return (unsigned)mutations; } static unsigned parsimonyValue (SpeciesTree tree) { /* EFFECTS: Returns the parsimony value of tree. */ unsigned score = 0; SpeciesTree left = tree->left; SpeciesTree right = tree->right; if (left != NULL) { score = score + mutationDistance (tree->root->genome, left->root->genome) + parsimonyValue (left); } if (right != NULL) { score = score + mutationDistance (tree->root->genome, right->root->genome) + parsimonyValue (right); } return score; } /* Prototype for chooseRoot, defined below. * This is needed because search_rooted_trees and chooseRoot * are mutually recursive. * Note: if you need to add an annotation to chooseRoot, you must * change its definition in both places. */ static SpeciesTree chooseRoot (SpeciesSet set, Species grandparent); static SpeciesTree findBestTreeRoot (SpeciesSet children, /*@dependent@*/ Species root) { /* REQUIRES: root != NULL * EFFECTS: Returns the best-scoring tree with the specified <root> * and <children>. */ unsigned best_score = (unsigned)-1; SpeciesTree best_tree = NULL; unsigned size = SpeciesSet_size(children); unsigned n_splits = (1 << size); unsigned i; /* Iterate over each of the n_splits ways of splitting the children * into two sets. */ for (i=0; i < n_splits; i++) { unsigned score; SpeciesSet left = SpeciesSet_new (); SpeciesSet right = SpeciesSet_new (); SpeciesTree tree = SpeciesTree_new (root); /* Split the children into two sets */ SpeciesSet_split(children,i,left,right); /* Arrange each subset into a subtree */ SpeciesTree_setLeft (tree, chooseRoot (left, root)); SpeciesTree_setRight (tree, chooseRoot (right, root)); score = parsimonyValue (tree); if (score < best_score) { best_tree = tree; best_score = score; } } if (best_tree != NULL) { return best_tree; } else { printf("BUG: best_tree is NULL\n"); exit(EXIT_FAILURE); } } SpeciesTree chooseRoot (SpeciesSet set, Species grandparent) { /* REQUIRES: grandparent != NULL ** EFFECTS: Determines which Species in <set> would make the best root ** under <grandparent>, and returns a tree with that root and all ** other elements of <set> as its children. */ unsigned best_score = (unsigned)-1; SpeciesTree best_tree = NULL; if (SpeciesSet_isEmpty (set)) { return best_tree; } /* Iterate over each potential root of the tree */ SpeciesSet_forall(set,root) { unsigned score; SpeciesTree tree; SpeciesSet children; /* Create a copy of the set of nodes, and remove the root */ children = SpeciesSet_copy (set); SpeciesSet_remove (children, root); /* Find the best subtree with that root */ tree = findBestTreeRoot (children, root); /* Compute the score of this subtree, and the mutationDistance * between this subtree's root and the node it branches off of */ score = parsimonyValue (tree)+ mutationDistance (root->genome, grandparent->genome); if (score < best_score) { best_score = score; best_tree = tree; } } end_SpeciesSet_forall; return best_tree; } # define MAX_LINE_LENGTH 1024 int main(int argc, char *argv[]) { if (argc != 2) { printf("Usage: phylogeny <file>\n"); exit (EXIT_FAILURE); } else { /* Read species names and genomes from file */ FILE *f; char *name; char *genome; char line[MAX_LINE_LENGTH]; SpeciesSet set = SpeciesSet_new(); Species root; bool gotroot = FALSE; f = fopen(argv[1], "rt"); while (fgets (line, MAX_LINE_LENGTH, f)) { char *genomestart; int index = 0; while (line[index] != '\0' && !isspace (line[index])) { index++; } if (line[index] == '\0' || index == 0) { fprintf (stderr, "Badly formatted line: %s\n", line); exit (EXIT_FAILURE); } name = (char *) malloc (sizeof (*name) * (index + 1)); if (name == NULL) { fprintf (stderr, "Out of memory reading file.\n"); exit (EXIT_FAILURE); } strncpy (name, line, (size_t) index); name[index] = '\0'; if (SpeciesSet_contains (set,name) || (gotroot && /*@-usedef@*/ /* Splint reports a possibly undefined use of root here, but we know it is defined because of gotroot. */ (strcmp (Species_getName (root), name) == 0) /*@=usedef@*/ )) { fprintf (stderr, "Duplicate species in file: %s\n",name); exit (EXIT_FAILURE); } /* ** Skip spaces */ while (line[index] != '\0' && isspace (line[index])) { index++; } genomestart = &line[index]; index = 0; while (genomestart[index] != '\0' && !isspace (genomestart[index])) { index++; } if (index == 0) { fprintf (stderr, "Badly formatted line (no genome): %s\n", line); exit (EXIT_FAILURE); } genome = (char *) malloc (sizeof (*genome) * (index + 1)); if (genome == NULL) { fprintf (stderr, "Out of memory reading genome.\n"); exit (EXIT_FAILURE); } strncpy (genome, genomestart, (size_t) index); genome[index] = '\0'; if (!gotroot) { root = Species_new (name, genome); gotroot = TRUE; } else { Species snew = Species_new (name, genome); fprintf (stderr, "Species: %s\n", Species_toString (snew)); Species_markOwned (snew); SpeciesSet_insert (set, snew); } } /* Find best tree with specified root, and print it */ if (!gotroot) { printf("No root species (empty file)\n"); exit(EXIT_FAILURE); } else { SpeciesTree tree; char *st; /* We know root is defined because of gotroot, but Splint can't determine this. */ tree = findBestTreeRoot (set, /*@-usedef@*/ root /*@=usedef@*/); st = SpeciesTree_toString (tree); printf ("Best tree: \n%s\n", st); free (st); printf("Parsimony value: %u\n", parsimonyValue (tree)); } } return EXIT_SUCCESS; }