@article{MiethKloftRodriguezetal.2016, author = {Mieth, Bettina and Kloft, Marius and Rodriguez, Juan Antonio and Sonnenburg, Soren and Vobruba, Robin and Morcillo-Suarez, Carlos and Farre, Xavier and Marigorta, Urko M. and Fehr, Ernst and Dickhaus, Thorsten and Blanchard, Gilles and Schunk, Daniel and Navarro, Arcadi and M{\"u}ller, Klaus-Robert}, title = {Combining Multiple Hypothesis Testing with Machine Learning Increases the Statistical Power of Genome-wide Association Studies}, series = {Scientific reports}, volume = {6}, journal = {Scientific reports}, publisher = {Nature Publ. Group}, address = {London}, issn = {2045-2322}, doi = {10.1038/srep36671}, pages = {14}, year = {2016}, abstract = {The standard approach to the analysis of genome-wide association studies (GWAS) is based on testing each position in the genome individually for statistical significance of its association with the phenotype under investigation. To improve the analysis of GWAS, we propose a combination of machine learning and statistical testing that takes correlation structures within the set of SNPs under investigation in a mathematically well-controlled manner into account. The novel two-step algorithm, COMBI, first trains a support vector machine to determine a subset of candidate SNPs and then performs hypothesis tests for these SNPs together with an adequate threshold correction. Applying COMBI to data from a WTCCC study (2007) and measuring performance as replication by independent GWAS published within the 2008-2015 period, we show that our method outperforms ordinary raw p-value thresholding as well as other state-of-the-art methods. COMBI presents higher power and precision than the examined alternatives while yielding fewer false (i.e. non-replicated) and more true (i.e. replicated) discoveries when its results are validated on later GWAS studies. More than 80\% of the discoveries made by COMBI upon WTCCC data have been validated by independent studies. Implementations of the COMBI method are available as a part of the GWASpi toolbox 2.0.}, language = {en} } @article{KawanabeBlanchardSugiyamaetal.2006, author = {Kawanabe, Motoaki and Blanchard, Gilles and Sugiyama, Masashi and Spokoiny, Vladimir G. and M{\"u}ller, Klaus-Robert}, title = {A novel dimension reduction procedure for searching non-Gaussian subspaces}, issn = {0302-9743}, doi = {10.1007/11679363_19}, year = {2006}, abstract = {In this article, we consider high-dimensional data which contains a low-dimensional non-Gaussian structure contaminated with Gaussian noise and propose a new linear method to identify the non-Gaussian subspace. Our method NGCA (Non-Gaussian Component Analysis) is based on a very general semi-parametric framework and has a theoretical guarantee that the estimation error of finding the non-Gaussian components tends to zero at a parametric rate. NGCA can be used not only as preprocessing for ICA, but also for extracting and visualizing more general structures like clusters. A numerical study demonstrates the usefulness of our method}, language = {en} } @article{BlanchardKawanabeSugiyamaetal.2006, author = {Blanchard, Gilles and Kawanabe, Motoaki and Sugiyama, Masashi and Spokoiny, Vladimir G. and M{\"u}ller, Klaus-Robert}, title = {In search of non-Gaussian components of a high-dimensional distribution}, issn = {1532-4435}, year = {2006}, abstract = {Finding non-Gaussian components of high-dimensional data is an important preprocessing step for efficient information processing. This article proposes a new linear method to identify the '' non-Gaussian subspace '' within a very general semi-parametric framework. Our proposed method, called NGCA (non-Gaussian component analysis), is based on a linear operator which, to any arbitrary nonlinear (smooth) function, associates a vector belonging to the low dimensional non-Gaussian target subspace, up to an estimation error. By applying this operator to a family of different nonlinear functions, one obtains a family of different vectors lying in a vicinity of the target space. As a final step, the target space itself is estimated by applying PCA to this family of vectors. We show that this procedure is consistent in the sense that the estimaton error tends to zero at a parametric rate, uniformly over the family, Numerical examples demonstrate the usefulness of our method}, language = {en} }