@phdthesis{Hagedorn2023,
  author    = {Hagedorn, Christopher},
  title     = {Parallel execution of causal structure learning on graphics processing units},
  doi       = {10.25932/publishup-59758},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-597582},
  school      = {Universit{\"a}t Potsdam},
  pages     = {8, 192},
  year      = {2023},
  abstract  = {Learning the causal structures from observational data is an omnipresent challenge in data science. The amount of observational data available to Causal Structure Learning (CSL) algorithms is increasing as data is collected at high frequency from many data sources nowadays. While processing more data generally yields higher accuracy in CSL, the concomitant increase in the runtime of CSL algorithms hinders their widespread adoption in practice. CSL is a parallelizable problem. Existing parallel CSL algorithms address execution on multi-core Central Processing Units (CPUs) with dozens of compute cores. However, modern computing systems are often heterogeneous and equipped with Graphics Processing Units (GPUs) to accelerate computations. Typically, these GPUs provide several thousand compute cores for massively parallel data processing. To shorten the runtime of CSL algorithms, we design efficient execution strategies that leverage the parallel processing power of GPUs. Particularly, we derive GPU-accelerated variants of a well-known constraint-based CSL method, the PC algorithm, as it allows choosing a statistical Conditional Independence test (CI test) appropriate to the observational data characteristics. Our two main contributions are: (1) to reflect differences in the CI tests, we design three GPU-based variants of the PC algorithm tailored to CI tests that handle data with the following characteristics. We develop one variant for data assuming the Gaussian distribution model, one for discrete data, and another for mixed discrete-continuous data and data with non-linear relationships. Each variant is optimized for the appropriate CI test leveraging GPU hardware properties, such as shared or thread-local memory. Our GPU-accelerated variants outperform state-of-the-art parallel CPU-based algorithms by factors of up to 93.4× for data assuming the Gaussian distribution model, up to 54.3× for discrete data, up to 240× for continuous data with non-linear relationships and up to 655× for mixed discrete-continuous data. However, the proposed GPU-based variants are limited to datasets that fit into a single GPU's memory. (2) To overcome this shortcoming, we develop approaches to scale our GPU-based variants beyond a single GPU's memory capacity. For example, we design an out-of-core GPU variant that employs explicit memory management to process arbitrary-sized datasets. Runtime measurements on a large gene expression dataset reveal that our out-of-core GPU variant is 364 times faster than a parallel CPU-based CSL algorithm. Overall, our proposed GPU-accelerated variants speed up CSL in numerous settings to foster CSL's adoption in practice and research.},
  language  = {en}
}
@phdthesis{Goethe2009,
  author    = {G{\"o}the, Katrin},
  title     = {The limits of parallel processing},
  url       = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-46063},
  school      = {Universit{\"a}t Potsdam},
  year      = {2009},
  abstract  = {Trying to do two things at once decreases performance of one or both tasks in many cases compared to the situation when one performs each task by itself. The present thesis deals with the question why and in which cases these dual-task costs emerge and moreover, whether there are cases in which people are able to process two cognitive tasks at the same time without costs. In four experiments the influence of stimulus-response (S-R) compatibility, S-R modality pairings, interindividual differences, and practice on parallel processing ability of two tasks are examined. Results show that parallel processing is possible. Nevertheless, dual-task costs emerge when: the personal processing strategy is serial, the two tasks have not been practiced together, S-R compatibility of both tasks is low (e.g. when a left target has to be responded with a right key press and in the other task an auditorily presented "A" has to be responded by saying "B"), and modality pairings of both tasks are Non Standard (i.e., visual-spatial stimuli are responded vocally whereas auditory-verbal stimuli are responded manually). Results are explained with respect to executive-based (S-R compatibility) and content-based crosstalk (S-R modality pairings) between tasks. Finally, an alternative information processing account with respect to the central stage of response selection (i.e., the translation of the stimulus to the response) is presented.},
  language  = {en}
}