% Conference paper in the proceedings of the 2018 IEEE International
% Conference on Bioinformatics and Biomedicine (BIBM).
@inproceedings{PerscheidFaberKrausetal.2018,
  author    = {Perscheid, Cindy and Faber, Lukas and Kraus, Milena and Arndt, Paul and Janke, Michael and Rehfeldt, Sebastian and Schubotz, Antje and Slosarek, Tamara and Uflacker, Matthias},
  title     = {A tissue-aware gene selection approach for analyzing multi-tissue gene expression data},
  booktitle = {2018 {IEEE} International Conference on Bioinformatics and Biomedicine ({BIBM})},
  publisher = {IEEE},
  address   = {New York},
  year      = {2018},
  pages     = {2159--2166},
  isbn      = {978-1-5386-5488-0},
  issn      = {2156-1125},
  doi       = {10.1109/BIBM.2018.8621189},
  language  = {en},
  abstract  = {High-throughput RNA sequencing (RNAseq) produces large data sets containing expression levels of thousands of genes. The analysis of RNAseq data leads to a better understanding of gene functions and interactions, which eventually helps to study diseases like cancer and develop effective treatments. Large-scale RNAseq expression studies on cancer comprise samples from multiple cancer types and aim to identify their distinct molecular characteristics. Analyzing samples from different cancer types implies analyzing samples from different tissue origin. Such multi-tissue RNAseq data sets require a meaningful analysis that accounts for the inherent tissue-related bias: The identified characteristics must not originate from the differences in tissue types, but from the actual differences in cancer types. However, current analysis procedures do not incorporate that aspect. As a result, we propose to integrate a tissue-awareness into the analysis of multi-tissue RNAseq data. We introduce an extension for gene selection that provides a tissue-wise context for every gene and can be flexibly combined with any existing gene selection approach. We suggest to expand conventional evaluation by additional metrics that are sensitive to the tissue-related bias.
Evaluations show that especially low complexity gene selection approaches profit from introducing tissue-awareness.},
}

% Conference paper in the DEXA 2018 proceedings (Part I), volume 11029 of
% the publisher's proceedings series.
@inproceedings{PodlesnyKayemvonSchorlemeretal.2018,
  author    = {Podlesny, Nikolai Jannik and Kayem, Anne V. D. M. and von Schorlemer, Stephan and Uflacker, Matthias},
  title     = {Minimising Information Loss on Anonymised High Dimensional Data with Greedy In-Memory Processing},
  booktitle = {Database and Expert Systems Applications, {DEXA} 2018, Part {I}},
  series    = {Lecture Notes in Computer Science},
  volume    = {11029},
  publisher = {Springer},
  address   = {Cham},
  year      = {2018},
  pages     = {85--100},
  isbn      = {978-3-319-98809-2},
  issn      = {0302-9743},
  doi       = {10.1007/978-3-319-98809-2_6},
  language  = {en},
  abstract  = {Minimising information loss on anonymised high dimensional data is important for data utility. Syntactic data anonymisation algorithms address this issue by generating datasets that are neither use-case specific nor dependent on runtime specifications. This results in anonymised datasets that can be re-used in different scenarios which is performance efficient. However, syntactic data anonymisation algorithms incur high information loss on high dimensional data, making the data unusable for analytics. In this paper, we propose an optimised exact quasi-identifier identification scheme, based on the notion of k-anonymity, to generate anonymised high dimensional datasets efficiently, and with low information loss. The optimised exact quasi-identifier identification scheme works by identifying and eliminating maximal partial unique column combination (mpUCC) attributes that endanger anonymity. By using in-memory processing to handle the attribute selection procedure, we significantly reduce the processing time required. We evaluated the effectiveness of our proposed approach with an enriched dataset drawn from multiple real-world data sources, and augmented with synthetic values generated in close alignment with the real-world data distributions.
Our results indicate that in-memory processing drops attribute selection time for the mpUCC candidates from 400s to 100s, while significantly reducing information loss. In addition, we achieve a time complexity speed-up of $O(3^{n/3}) \approx O(1.4422^{n})$.},
}