% Thesis records (school: Universitaet Potsdam). One field per line; text
% outside the entries, such as these remarks, is ignored by BibTeX.

% PhD thesis on data preparation and duplicate detection.
@phdthesis{Koumarelas2020,
  author   = {Koumarelas, Ioannis},
  title    = {Data preparation and domain-agnostic duplicate detection},
  school   = {Universit{\"a}t Potsdam},
  year     = {2020},
  pages    = {x, 97},
  doi      = {10.25932/publishup-48913},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-489131},
  language = {en},
  abstract = {Successfully completing any data science project demands careful consideration across its whole process. Although the focus is often put on later phases of the process, in practice, experts spend more time in earlier phases, preparing data, to make them consistent with the systems' requirements or to improve their models' accuracies. Duplicate detection is typically applied during the data cleaning phase, which is dedicated to removing data inconsistencies and improving the overall quality and usability of data. While data cleaning involves a plethora of approaches to perform specific operations, such as schema alignment and data normalization, the task of detecting and removing duplicate records is particularly challenging. Duplicates arise when multiple records representing the same entities exist in a database. Due to numerous reasons, spanning from simple typographical errors to different schemas and formats of integrated databases. Keeping a database free of duplicates is crucial for most use-cases, as their existence causes false negatives and false positives when matching queries against it. These two data quality issues have negative implications for tasks, such as hotel booking, where users may erroneously select a wrong hotel, or parcel delivery, where a parcel can get delivered to the wrong address. Identifying the variety of possible data issues to eliminate duplicates demands sophisticated approaches. While research in duplicate detection is well-established and covers different aspects of both efficiency and effectiveness, our work in this thesis focuses on the latter.
We propose novel approaches to improve data quality before duplicate detection takes place and apply the latter in datasets even when prior labeling is not available. Our experiments show that improving data quality upfront can increase duplicate classification results by up to 19\%. To this end, we propose two novel pipelines that select and apply generic as well as address-specific data preparation steps with the purpose of maximizing the success of duplicate detection. Generic data preparation, such as the removal of special characters, can be applied to any relation with alphanumeric attributes. When applied, data preparation steps are selected only for attributes where there are positive effects on pair similarities, which indirectly affect classification, or on classification directly. Our work on addresses is twofold; first, we consider more domain-specific approaches to improve the quality of values, and, second, we experiment with known and modified versions of similarity measures to select the most appropriate per address attribute, e.g., city or country. To facilitate duplicate detection in applications where gold standard annotations are not available and obtaining them is not possible or too expensive, we propose MDedup. MDedup is a novel, rule-based, and fully automatic duplicate detection approach that is based on matching dependencies. These dependencies can be used to detect duplicates and can be discovered using state-of-the-art algorithms efficiently and without any prior labeling. MDedup uses two pipelines to first train on datasets with known labels, learning to identify useful matching dependencies, and then be applied on unseen datasets, regardless of any existing gold standard.
Finally, our work is accompanied by open source code to enable repeatability of our research results and application of our approaches to other datasets.},
}

% Bachelor thesis. The entry type is the standard "mastersthesis"; the type
% field replaces the default printed label with "Bachelor Thesis".
@mastersthesis{Eggers2023,
  author   = {Eggers, Nele},
  title    = {Properties of {Arctic} aerosol in the transition between {Arctic} haze to summer season derived by lidar},
  type     = {Bachelor Thesis},
  school   = {Universit{\"a}t Potsdam},
  year     = {2023},
  pages    = {x, 63},
  doi      = {10.25932/publishup-61943},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-619438},
  language = {en},
  abstract = {During the Arctic haze period, the Arctic troposphere consists of larger, yet fewer, aerosol particles than during the summer (Tunved et al., 2013; Quinn et al., 2007). Interannual variability (Gra{\ss}l and Ritter, 2019; Rinke et al., 2004), as well as unknown origins (Stock et al., 2014) and properties of aerosol complicate modeling these annual aerosol cycles. This thesis investigates the modification of the microphysical properties of Arctic aerosols in the transition from Arctic haze to the summer season. Therefore, lidar measurements of Ny-{\AA}lesund from April 2021 to the end of July 2021 are evaluated based on the aerosols' optical properties. An overview of those properties will be provided. Furthermore, parallel radiosonde data is considered for indication of hygroscopic growth. The annual aerosol cycle in 2021 differs from expectations based on previous studies from Tunved et al. (2013) and Quinn et al. (2007). Developments of backscatter, extinction, aerosol depolarisation, lidar ratio and color ratio show a return of the Arctic haze in May. The haze had already reduced in April, but regrew afterwards. The average Arctic aerosol displays hygroscopic behaviour, meaning growth due to water uptake. To determine such a behaviour is generally laborious because various meteorological circumstances need to be considered. Two case studies provide further information on these possible events.
In particular, a day with a rare ice cloud and with highly variable water cloud layers is observed.},
}

% PhD thesis on Arctic aerosol and cirrus clouds.
@phdthesis{Nakoudi2021,
  author   = {Nakoudi, Konstantina},
  title    = {Properties and radiative effect of aerosol and cirrus clouds over the {European} {Arctic}},
  school   = {Universit{\"a}t Potsdam},
  year     = {2021},
  pages    = {x, 136},
  doi      = {10.25932/publishup-53036},
  url      = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus4-530366},
  language = {en},
  abstract = {Over the last decades, the rate of near-surface warming in the Arctic is at least double than elsewhere on our planet (Arctic amplification). However, the relative contribution of different feedback processes to Arctic amplification is a topic of ongoing research, including the role of aerosol and clouds. Lidar systems are well-suited for the investigation of aerosol and optically-thin clouds as they provide vertically-resolved information on fine temporal scales. Global aerosol models fail to converge on the sign of the Arctic aerosol radiative effect (ARE). In the first part of this work, the optical and microphysical properties of Arctic aerosol were characterized at case study level in order to assess the short-wave (SW) ARE. A long-range transport episode was first investigated. Geometrically similar aerosol layers were captured over three locations. Although the aerosol size distribution was different between Fram Strait (bi-modal) and Ny-{\AA}lesund (fine mono-modal), the atmospheric column ARE was similar. The latter was related to the domination of accumulation mode aerosol. Over both locations top of the atmosphere (TOA) warming was accompanied by surface cooling. Subsequently, the sensitivity of ARE was investigated with respect to different aerosol and spring-time ambient conditions. A 10\% change in the single-scattering albedo (SSA) induced higher ARE perturbations compared to a 30\% change in the aerosol extinction coefficient.
With respect to ambient conditions, the ARETOA was more sensitive to solar elevation changes compared to AREsurface. Over dark surfaces the ARE profile was exclusively negative, while over bright surfaces a negative to positive shift occurred above the aerosol layers. Consequently, the sign of ARE can be highly sensitive in spring since this season is characterized by transitional surface albedo conditions. As the inversion of the aerosol microphysics is an ill-posed problem, the inferred aerosol size distribution of a low-tropospheric event was compared to the in-situ measured distribution. Both techniques revealed a bi-modal distribution, with good agreement in the total volume concentration. However, in terms of SSA a disagreement was found, with the lidar inversion indicating highly scattering particles and the in-situ measurements pointing to absorbing particles. The discrepancies could stem from assumptions in the inversion (e.g. wavelength-independent refractive index) and errors in the conversion of the in-situ measured light attenuation into absorption. Another source of discrepancy might be related to an incomplete capture of fine particles in the in-situ sensors. The disagreement in the most critical parameter for the Arctic ARE necessitates further exploration in the frame of aerosol closure experiments. Care must be taken in ARE modelling studies, which may use either the in-situ or lidar-derived SSA as input. Reliable characterization of cirrus geometrical and optical properties is necessary for improving their radiative estimates. In this respect, the detection of sub-visible cirrus is of special importance. The total cloud radiative effect (CRE) can be negatively biased, should only the optically-thin and opaque cirrus contributions are considered. To this end, a cirrus retrieval scheme was developed aiming at increased sensitivity to thin clouds.
The cirrus detection was based on the wavelet covariance transform (WCT) method, extended by dynamic thresholds. The dynamic WCT exhibited high sensitivity to faint and thin cirrus layers (less than 200 m) that were partly or completely undetected by the existing static method. The optical characterization scheme extended the Klett-Fernald retrieval by an iterative lidar ratio (LR) determination (constrained Klett). The iterative process was constrained by a reference value, which indicated the aerosol concentration beneath the cirrus cloud. Contrary to existing approaches, the aerosol-free assumption was not adopted, but the aerosol conditions were approximated by an initial guess. The inherent uncertainties of the constrained Klett were higher for optically-thinner cirrus, but an overall good agreement was found with two established retrievals. Additionally, existing approaches, which rely on aerosol-free assumptions, presented increased accuracy when the proposed reference value was adopted. The constrained Klett retrieved reliably the optical properties in all cirrus regimes, including upper sub-visible cirrus with COD down to 0.02. Cirrus is the only cloud type capable of inducing TOA cooling or heating at daytime. Over the Arctic, however, the properties and CRE of cirrus are under-explored. In the final part of this work, long-term cirrus geometrical and optical properties were investigated for the first time over an Arctic site (Ny-{\AA}lesund). To this end, the newly developed retrieval scheme was employed. Cirrus layers over Ny-{\AA}lesund seemed to be more absorbing in the visible spectral region compared to lower latitudes and comprise relatively more spherical ice particles. Such meridional differences could be related to discrepancies in absolute humidity and ice nucleation mechanisms. The COD tended to decline for less spherical and smaller ice particles probably due to reduced water vapor deposition on the particle surface.
The cirrus optical properties presented weak dependence on ambient temperature and wind conditions. Over the 10 years of the analysis, no clear temporal trend was found and the seasonal cycle was not pronounced. However, winter cirrus appeared under colder conditions and stronger winds. Moreover, they were optically-thicker, less absorbing and consisted of relatively more spherical ice particles. A positive CREnet was primarily revealed for a broad range of representative cloud properties and ambient conditions. Only for high COD (above 10) and over tundra a negative CREnet was estimated, which did not hold true over snow/ice surfaces. Consequently, the COD in combination with the surface albedo seem to play the most critical role in determining the CRE sign over the high European Arctic.},
}