@phdthesis{Jurish2011, author = {Jurish, Bryan}, title = {Finite-state canonicalization techniques for historical German}, url = {http://nbn-resolving.de/urn:nbn:de:kobv:517-opus-55789}, school = {Universit{\"a}t Potsdam}, year = {2011}, abstract = {This work addresses issues in the automatic preprocessing of historical German input text for use by conventional natural language processing techniques. Conventional techniques cannot adequately account for historical input text due to conventional tools' reliance on a fixed application-specific lexicon keyed by contemporary orthographic surface form on the one hand, and the lack of consistent orthographic conventions in historical input text on the other. Historical spelling variation is treated here as an error-correction problem or "canonicalization" task: an attempt to automatically assign each (historical) input word a unique extant canonical cognate, thus allowing direct application-specific processing (tagging, parsing, etc.) of the returned canonical forms without need for any additional application-specific modifications. In the course of the work, various methods for automatic canonicalization are investigated and empirically evaluated, including conflation by phonetic identity, conflation by lemma instantiation heuristics, canonicalization by weighted finite-state rewrite cascade, and token-wise disambiguation by a dynamic Hidden Markov Model.}, language = {en} }