% Burleigh et al. (2011): journal article, Systematic Biology 60(2).
% The journal name is stored once in `journal`; the page range uses a bare `--`.
@article{BurleighBansalEulensteinetal.2011,
  author    = {Burleigh, J. Gordon and Bansal, Mukul S. and Eulenstein, Oliver and Hartmann, Stefanie and Wehe, Andre and Vision, Todd J.},
  title     = {Genome-Scale Phylogenetics: Inferring the Plant Tree of Life from 18,896 Gene Trees},
  journal   = {Systematic Biology},
  volume    = {60},
  number    = {2},
  pages     = {117--125},
  year      = {2011},
  publisher = {Oxford University Press},
  address   = {Oxford},
  issn      = {1063-5157},
  doi       = {10.1093/sysbio/syq072},
  abstract  = {Phylogenetic analyses using genome-scale data sets must confront incongruence among gene trees, which in plants is exacerbated by frequent gene duplications and losses. Gene tree parsimony (GTP) is a phylogenetic optimization criterion in which a species tree that minimizes the number of gene duplications induced among a set of gene trees is selected. The run time performance of previous implementations has limited its use on large-scale data sets. We used new software that incorporates recent algorithmic advances to examine the performance of GTP on a plant data set consisting of 18,896 gene trees containing 510,922 protein sequences from 136 plant taxa (giving a combined alignment length of >2.9 million characters). The relationships inferred from the GTP analysis were largely consistent with previous large-scale studies of backbone plant phylogeny and resolved some controversial nodes. The placement of taxa that were present in few gene trees generally varied the most among GTP bootstrap replicates. Excluding these taxa either before or after the GTP analysis revealed high levels of phylogenetic support across plants. The analyses supported magnoliids sister to a eudicot + monocot clade and did not support the eurosid I and II clades. This study presents a nuclear genomic perspective on the broad-scale phylogenic relationships among plants, and it demonstrates that nuclear genes with a history of duplication and loss can be phylogenetically informative for resolving the plant tree of life.},
  language  = {en}
}
% Cheng et al. (2009): journal article in Bioinformatics (ISSN 1367-4803).
% `journal` is required for article entries; volume/number/pages should be
% re-checked against the DOI record when the database is next validated.
@article{ChengHartmannGuptaetal.2009,
  author    = {Cheng, Fuxia and Hartmann, Stefanie and Gupta, Mayetri and Ibrahim, Joseph G. and Vision, Todd J.},
  title     = {A Hierarchical Model for Incomplete Alignments in Phylogenetic Inference},
  journal   = {Bioinformatics},
  volume    = {25},
  number    = {5},
  pages     = {592--598},
  year      = {2009},
  issn      = {1367-4803},
  doi       = {10.1093/bioinformatics/btp015},
  abstract  = {Motivation: Full-length DNA and protein sequences that span the entire length of a gene are ideally used for multiple sequence alignments (MSAs) and the subsequent inference of their relationships. Frequently, however, MSAs contain a substantial amount of missing data. For example, expressed sequence tags (ESTs), which are partial sequences of expressed genes, are the predominant source of sequence data for many organisms. The patterns of missing data typical for EST-derived alignments greatly compromise the accuracy of estimated phylogenies. Results: We present a statistical method for inferring phylogenetic trees from EST-based incomplete MSA data. We propose a class of hierarchical models for modeling pairwise distances between the sequences, and develop a fully Bayesian approach for estimation of the model parameters. Once the distance matrix is estimated, the phylogenetic tree may be constructed by applying neighbor-joining (or any other algorithm of choice). We also show that maximizing the marginal likelihood from the Bayesian approach yields similar results to a profile likelihood estimation. The proposed methods are illustrated using simulated protein families, for which the true phylogeny is known, and one real protein family.},
  language  = {en}
}
% Hartmann and Vision (2008): postprint no. 889 in the Universitaet Potsdam
% postprint series. The series title is stored once in `series`; `pages`
% holds the value 15 exactly as exported (NOTE(review): presumably a total
% page count rather than a range -- confirm).
@misc{HartmannVision2008,
  author    = {Hartmann, Stefanie and Vision, Todd J.},
  title     = {Using {ESTs} for Phylogenomics},
  series    = {Postprints der Universit{\"a}t Potsdam : Mathematisch Naturwissenschaftliche Reihe},
  number    = {889},
  pages     = {15},
  year      = {2008},
  issn      = {1866-8372},
  doi       = {10.25932/publishup-43667},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-436670},
  abstract  = {Background While full genome sequences are still only available for a handful of taxa, large collections of partial gene sequences are available for many more. The alignment of partial gene sequences results in a multiple sequence alignment containing large gaps that are arranged in a staggered pattern. The consequences of this pattern of missing data on the accuracy of phylogenetic analysis are not well understood. We conducted a simulation study to determine the accuracy of phylogenetic trees obtained from gappy alignments using three commonly used phylogenetic reconstruction methods (Neighbor Joining, Maximum Parsimony, and Maximum Likelihood) and studied ways to improve the accuracy of trees obtained from such datasets. Results We found that the pattern of gappiness in multiple sequence alignments derived from partial gene sequences substantially compromised phylogenetic accuracy even in the absence of alignment error. The decline in accuracy was beyond what would be expected based on the amount of missing data. The decline was particularly dramatic for Neighbor Joining and Maximum Parsimony, where the majority of gappy alignments contained 25\% to 40\% incorrect quartets. To improve the accuracy of the trees obtained from a gappy multiple sequence alignment, we examined two approaches. In the first approach, alignment masking, potentially problematic columns and input sequences are excluded from the dataset. Even in the absence of alignment error, masking improved phylogenetic accuracy up to 100-fold. However, masking retained, on average, only 83\% of the input sequences. In the second approach, alignment subdivision, the missing data is statistically modelled in order to retain as many sequences as possible in the phylogenetic analysis. Subdivision resulted in more modest improvements to alignment accuracy, but succeeded in including almost all of the input sequences.
Conclusion These results demonstrate that partial gene sequences and gappy multiple sequence alignments can pose a major problem for phylogenetic analysis. The concern will be greatest for high-throughput phylogenomic analyses, in which Neighbor Joining is often the preferred method due to its computational efficiency. Both approaches can be used to increase the accuracy of phylogenetic inference from a gappy alignment. The choice between the two approaches will depend upon how robust the application is to the loss of sequences from the input set, with alignment masking generally giving a much greater improvement in accuracy but at the cost of discarding a larger number of the input sequences.},
  language  = {en}
}