% Doctoral thesis record: Podlesny, 2023.
% Identified by both a DOI and a URN resolver link; the abstract is stored verbatim.
@phdthesis{Podlesny2023,
  author   = {Podlesny, Nikolai Jannik},
  title    = {Quasi-identifier discovery to prevent privacy violating inferences in large high dimensional datasets},
  school   = {Universit{\"a}t Potsdam},
  year     = {2023},
  pages    = {xvi, 140},
  doi      = {10.25932/publishup-58784},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-587843},
  language = {en},
  abstract = {Personal data privacy is considered to be a fundamental right. It forms a part of our highest ethical standards and is anchored in legislation and various best practices from the technical perspective. Yet, protecting against personal data exposure is a challenging problem from the perspective of generating privacy-preserving datasets to support machine learning and data mining operations. The issue is further compounded by the fact that devices such as consumer wearables and sensors track user behaviours on such a fine-grained level, thereby accelerating the formation of multi-attribute and large-scale high-dimensional datasets. In recent years, increasing news coverage regarding de-anonymisation incidents, including but not limited to the telecommunication, transportation, financial transaction, and healthcare sectors, have resulted in the exposure of sensitive private information. These incidents indicate that releasing privacy-preserving datasets requires serious consideration from the pre-processing perspective. A critical problem that appears in this regard is the time complexity issue in applying syntactic anonymisation methods, such as k-anonymity, l-diversity, or t-closeness to generating privacy-preserving data. Previous studies have shown that this problem is NP-hard. This thesis focuses on large high-dimensional datasets as an example of a special case of data that is characteristically challenging to anonymise using syntactic methods. In essence, large high-dimensional data contains a proportionately large number of attributes in proportion to the population of attribute values.
Applying standard syntactic data anonymisation approaches to generating privacy-preserving data based on such methods results in high information-loss, thereby rendering the data useless for analytics operations or in low privacy due to inferences based on the data when information loss is minimised. We postulate that this problem can be resolved effectively by searching for and eliminating all the quasi-identifiers present in a high-dimensional dataset. Essentially, we quantify the privacy-preserving data sharing problem as the Find-QID problem. Further, we show that despite the complex nature of absolute privacy, the discovery of QID can be achieved reliably for large datasets. The risk of private data exposure through inferences can be circumvented, and both can be practicably achieved without the need for high-performance computers. For this purpose, we present, implement, and empirically assess both mathematical and engineering optimisation methods for a deterministic discovery of privacy-violating inferences. This includes a greedy search scheme by efficiently queuing QID candidates based on their tuple characteristics, projecting QIDs on Bayesian inferences, and countering Bayesian network's state-space-explosion with an aggregation strategy taken from multigrid context and vectorised GPU acceleration. Part of this work showcases magnitudes of processing acceleration, particularly in high dimensions. We even achieve near real-time runtime for currently impractical applications. At the same time, we demonstrate how such contributions could be abused to de-anonymise Kristine A. and Cameron R. in a public Twitter dataset addressing the US Presidential Election 2020. Finally, this work contributes, implements, and evaluates an extended and generalised version of the novel syntactic anonymisation methodology, attribute compartmentation. Attribute compartmentation promises sanitised datasets without remaining quasi-identifiers while minimising information loss.
To prove its functionality in the real world, we partner with digital health experts to conduct a medical use case study. As part of the experiments, we illustrate that attribute compartmentation is suitable for everyday use and, as a positive side effect, even circumvents a common domain issue of base rate neglect.},
}

% Doctoral thesis record: Koehler, 2024.
% The abstract text is German while the language field says en.
% NOTE(review): presumably the language field describes the thesis itself, not the abstract - confirm.
% The sharp s in the abstract is written as a LaTeX escape, like the umlauts, so the entry stays 7-bit clean.
@phdthesis{Koehler2024,
  author   = {K{\"o}hler, Wolfgang},
  title    = {Challenges of efficient and compliant data processing},
  school   = {Universit{\"a}t Potsdam},
  year     = {2024},
  pages    = {195},
  doi      = {10.25932/publishup-62784},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-627843},
  language = {en},
  abstract = {Die fortschreitende Digitalisierung ver{\"a}ndert die Gesellschaft und hat weitreichende Auswirkungen auf Menschen und Unternehmen. Grundlegend f{\"u}r diese Ver{\"a}nderungen sind die neuen technologischen M{\"o}glichkeiten, Daten in immer gr{\"o}{\ss}erem Umfang und f{\"u}r vielf{\"a}ltige neue Zwecke zu verarbeiten. Von besonderer Bedeutung ist dabei die Verf{\"u}gbarkeit gro{\ss}er und qualitativ hochwertiger Datens{\"a}tze, insbesondere auf Basis personenbezogener Daten. Sie werden entweder zur Verbesserung der Produktivit{\"a}t, Qualit{\"a}t und Individualit{\"a}t von Produkten und Dienstleistungen oder gar zur Entwicklung neuartiger Dienstleistungen verwendet. Heute wird das Nutzerverhalten, trotz weltweit steigender gesetzlicher Anforderungen an den Schutz personenbezogener Daten, aktiver und umfassender verfolgt als je zuvor. Dies wirft vermehrt ethische, moralische und gesellschaftliche Fragen auf, die nicht zuletzt durch popul{\"a}re F{\"a}lle des Datenmissbrauchs in den Vordergrund der politischen Debatte ger{\"u}ckt sind. Angesichts dieses Diskurses und der gesetzlichen Anforderungen muss heutiges Datenmanagement drei Bedingungen erf{\"u}llen: Erstens die Legalit{\"a}t bzw. Gesetzeskonformit{\"a}t der Nutzung, zweitens die ethische Legitimit{\"a}t.
Drittens sollte die Datennutzung aus betriebswirtschaftlicher Sicht wertsch{\"o}pfend sein. Im Rahmen dieser Bedingungen verfolgt die vorliegende kumulative Dissertation vier Forschungsziele mit dem Fokus, ein besseres Verst{\"a}ndnis (1) der Herausforderungen bei der Umsetzung von Gesetzen zum Schutz von Privatsph{\"a}re, (2) der Faktoren, die die Bereitschaft der Kunden zur Weitergabe pers{\"o}nlicher Daten beeinflussen, (3) der Rolle des Datenschutzes f{\"u}r das digitale Unternehmertum und (4) der interdisziplin{\"a}ren wissenschaftlichen Bedeutung, deren Entwicklung und Zusammenh{\"a}nge zu erlangen.},
}