@comment{
  Publications on the Migol Grid middleware by Luckow, Schnor and co-authors.
  Accented letters use BibTeX special-character form, i.e. a brace group
  whose first token is the accent command, so that sorting and label
  generation work under classic BibTeX as well as Biber.
  NOTE(review): all article entries below lack the required journal field.
  The two entries carrying an isbn are presumably contributions to
  proceedings volumes; confirm the venue, then switch them to inproceedings
  with a booktitle, or add the journal name.
}

@article{LuckowSchnor2005,
  author   = {Luckow, Andr{\'e} and Schnor, Bettina},
  title    = {{Migol}: A Fault-Tolerant Service Framework for {MPI} Applications in the Grid},
  isbn     = {978-3-540-29009-4},
  year     = {2005},
  abstract = {In a distributed, inherently dynamic Grid environment the reliability of individual resources cannot be guaranteed. The more resources and components are involved the more error-prone is the system. Therefore, it is important to enhance the dependability of the system with fault-tolerance mechanisms. In this paper, we present Migol, a fault-tolerant, self-healing Grid service infrastructure for MPI applications. The benefit of the Grid is that in case of a failure an application may be migrated and restarted from a checkpoint file on another site. This approach requires a service infrastructure which handles the necessary activities transparently for an application. But any migration framework cannot support fault-tolerant applications, if it is not fault-tolerant itself.},
  language = {en},
}

@article{HallamaLuckowSchnor2006,
  author   = {Hallama, Nicole and Luckow, Andr{\'e} and Schnor, Bettina},
  title    = {Grid Security for Fault Tolerant Grid Applications},
  isbn     = {978-1-880843-60-4},
  year     = {2006},
  language = {en},
}

@article{LuckowSchnor2006,
  author   = {Luckow, Andr{\'e} and Schnor, Bettina},
  title    = {{Migol}: A Fault Tolerant Service Framework for Grid Computing: Evolution to {WSRF} (2006)},
  year     = {2006},
  language = {en},
}

@article{JeskeLuckowSchnor2007,
  author   = {Jeske, Janin and Luckow, Andr{\'e} and Schnor, Bettina},
  title    = {Reservation-based Resource-Brokering for Grid Computing},
  year     = {2007},
  abstract = {In this paper we present the design and implementation of the Migol brokering framework. Migol is a Grid middleware, which addresses the fault-tolerance of long-running and compute-intensive applications. The framework supports e.g. the automatic and transparent recovery respectively the migration of applications.
              Another core feature of Migol is the discovery, selection, and allocation of resources using advance reservation. Grid broker systems can significantly benefit from advance reservation. With advance reservation brokers and users can obtain execution guarantees from local resource management systems (LRM) without requiring detailed knowledge of current and future workloads or of the resource owner's policies. Migol's Advance Reservation Service (ARS) provides an adapter layer for reservation capabilities of different LRMs, which is currently not provided by existing Grid middleware platforms. Further, we propose a shortest expected delay (SED) strategy for scheduling of advance reservations within the Job Broker Service. SED needs information about the earliest start time of an application. This is currently not supported by LRMs. We added this feature for PBSPro. Migol depends on Globus and its security infrastructure. Our performance experiments show the substantial overhead of this service-oriented approach.},
  language = {en},
}

@article{LuckowSchnor2008,
  author   = {Luckow, Andr{\'e} and Schnor, Bettina},
  title    = {{Migol}: a fault-tolerant service framework for {MPI} applications in the grid},
  doi      = {10.1016/j.future.2007.03.007},
  year     = {2008},
  abstract = {Especially for sciences the provision of massive parallel CPU capacity is one of the most attractive features of a grid. A major challenge in a distributed, inherently dynamic grid is fault tolerance. The more resources and components involved, the more complicated and error-prone becomes the system. In a grid with potentially thousands of machines connected to each other the reliability of individual resources cannot be guaranteed. The benefit of the grid is that in case of a failure an application may be migrated and restarted from a checkpoint file on another site. This approach requires a service infrastructure which handles the necessary activities transparently.
              In this article, we present Migol, a fault-tolerant and self-healing grid middleware for MPI applications. Migol is based on open standards and extends the services of the Globus toolkit to support the fault tolerance of grid applications. Further, the Migol framework itself is designed with special focus on fault tolerance. For example, Migol replicates critical services and uses a ring-based replication protocol to achieve data consistency. (c) 2007 Elsevier B.V. All rights reserved.},
  language = {en},
}

@comment{
  NOTE(review): the thesis entry below has no school field, which the
  phdthesis type requires; the address suggests a Potsdam institution, but
  the awarding body is not stated here. TODO: confirm and add it.
  The page count is stored as a bare number in pagetotal; the style adds
  the localised page abbreviation.
}

@phdthesis{Luckow2009,
  author    = {Luckow, Andr{\'e}},
  title     = {A dependable middleware for enhancing the fault tolerance of distributed computations in grid environments},
  address   = {Potsdam},
  pagetotal = {235},
  year      = {2009},
  language  = {en},
}