{"@context":{"@vocab":"https://cir.nii.ac.jp/schema/1.0/","rdfs":"http://www.w3.org/2000/01/rdf-schema#","dc":"http://purl.org/dc/elements/1.1/","dcterms":"http://purl.org/dc/terms/","foaf":"http://xmlns.com/foaf/0.1/","prism":"http://prismstandard.org/namespaces/basic/2.0/","cinii":"http://ci.nii.ac.jp/ns/1.0/","datacite":"https://schema.datacite.org/meta/kernel-4/","ndl":"http://ndl.go.jp/dcndl/terms/","jpcoar":"https://github.com/JPCOAR/schema/blob/master/2.0/"},"@id":"https://cir.nii.ac.jp/crid/1363388844796044544.json","@type":"Article","productIdentifier":[{"identifier":{"@type":"DOI","@value":"10.1145/568522.568525"}},{"identifier":{"@type":"URI","@value":"https://dl.acm.org/doi/10.1145/568522.568525"}},{"identifier":{"@type":"URI","@value":"https://dl.acm.org/doi/pdf/10.1145/568522.568525"}},{"identifier":{"@type":"NAID","@value":"80015493653"}}],"dc:title":[{"@value":"A survey of rollback-recovery protocols in message-passing systems"}],"description":[{"type":"abstract","notation":[{"@value":"<jats:p>This survey covers rollback-recovery techniques that do not require special language constructs. In the first part of the survey we classify rollback-recovery protocols into<jats:italic>checkpoint-based</jats:italic>and<jats:italic>log-based.</jats:italic><jats:italic>Checkpoint-based</jats:italic>protocols rely solely on checkpointing for system state restoration. Checkpointing can be coordinated, uncoordinated, or communication-induced.<jats:italic>Log-based</jats:italic>protocols combine checkpointing with logging of nondeterministic events, encoded in tuples called<jats:italic>determinants</jats:italic>. Depending on how determinants are logged, log-based protocols can be pessimistic, optimistic, or causal. Throughout the survey, we highlight the research issues that are at the core of rollback-recovery and present the solutions that currently address them. We also compare the performance of different rollback-recovery protocols with respect to a series of desirable properties and discuss the issues that arise in the practical implementations of these protocols.</jats:p>"}]}],"creator":[{"@id":"https://cir.nii.ac.jp/crid/1383388844796044416","@type":"Researcher","foaf:name":[{"@value":"E. N. (Mootaz) Elnozahy"}],"jpcoar:affiliationName":[{"@value":"IBM Research, Austin, TX"}]},{"@id":"https://cir.nii.ac.jp/crid/1383388844796044544","@type":"Researcher","foaf:name":[{"@value":"Lorenzo Alvisi"}],"jpcoar:affiliationName":[{"@value":"The University of Texas at Austin, Austin, TX"}]},{"@id":"https://cir.nii.ac.jp/crid/1383388844796044417","@type":"Researcher","foaf:name":[{"@value":"Yi-Min Wang"}],"jpcoar:affiliationName":[{"@value":"Microsoft Research, Redmond, WA"}]},{"@id":"https://cir.nii.ac.jp/crid/1383388844796044418","@type":"Researcher","foaf:name":[{"@value":"David B. Johnson"}],"jpcoar:affiliationName":[{"@value":"Rice University, Houston, TX"}]}],"publication":{"publicationIdentifier":[{"@type":"PISSN","@value":"03600300"},{"@type":"EISSN","@value":"15577341"}],"prism:publicationName":[{"@value":"ACM Computing Surveys"}],"dc:publisher":[{"@value":"Association for Computing Machinery (ACM)"}],"prism:publicationDate":"2002-09","prism:volume":"34","prism:number":"3","prism:startingPage":"375","prism:endingPage":"408"},"reviewed":"false","dc:rights":["https://www.acm.org/publications/policies/copyright_policy#Background"],"url":[{"@id":"https://dl.acm.org/doi/10.1145/568522.568525"},{"@id":"https://dl.acm.org/doi/pdf/10.1145/568522.568525"}],"createdAt":"2002-10-07","modifiedAt":"2025-06-18","relatedProduct":[{"@id":"https://cir.nii.ac.jp/crid/1050001337881305728","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"アドホックネットワークのための中継ログ手法を用いたチェックポイントプロトコル"},{"@language":"en","@value":"Checkpoint Protocol with Intermediate Message Logging for Mobile Ad-hoc Networks"},{"@language":"ja-Kana","@value":"アドホック ネットワーク ノ タメノ チュウケイ ログ シュホウ オ モチイタ チェックポイント プロトコル"}]},{"@id":"https://cir.nii.ac.jp/crid/1050282812869883648","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"単一システムイメージを提供するための仮想マシンモニタ"},{"@language":"en","@value":"A Virtual Machine Monitor for Providing a Single System Image"},{"@language":"ja-Kana","@value":"タンイツ システム イメージ オ テイキョウ スル タメ ノ カソウ マシンモニタ"}]},{"@id":"https://cir.nii.ac.jp/crid/1360004230547211008","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@value":"Automatic Parameter Tuning of Hierarchical Incremental Checkpointing"}]},{"@id":"https://cir.nii.ac.jp/crid/1360004235162084864","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@value":"A Cooperative Partial Snapshot Algorithm for Checkpoint-Rollback Recovery of Large-Scale and Dynamic Distributed Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1360285710142112128","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@value":"A User-Level InfiniBand-Based File System and Checkpoint Strategy for Burst Buffers"}]},{"@id":"https://cir.nii.ac.jp/crid/1360290617586193280","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@value":"Reversible CSP Computations"}]},{"@id":"https://cir.nii.ac.jp/crid/1360572092563469568","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@value":"ReverCSP: Time-Travelling in CSP Computations"}]},{"@id":"https://cir.nii.ac.jp/crid/1360848660108092544","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@value":"Hierarchical Clustering Strategies for Fault Tolerance in Large Scale HPC Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1360848660354416768","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@value":"Modular Software Model Checking for Distributed Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1361975839485556736","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@value":"A cooperative partial snapshot algorithm for checkpoint‐rollback recovery of large‐scale and dynamic distributed systems and experimental evaluations"}]},{"@id":"https://cir.nii.ac.jp/crid/1362260173461442560","@type":"Article","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@value":"A proposal and evaluation of a coordinated checkpointing technique using incremental snapshots"}]},{"@id":"https://cir.nii.ac.jp/crid/1390001204378201344","@type":"Article","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"A Scalable Communication-Induced Checkpointing Algorithm for Distributed Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1390001204378266240","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Checkpoint Time Arrangement Rotation in Hybrid State Saving with a Limited Number of Periodical Checkpoints"}]},{"@id":"https://cir.nii.ac.jp/crid/1390001204378430464","@type":"Article","relationType":["isReferencedBy","isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"WBC-ALC: A Weak Blocking Coordinated Application-Level Checkpointing for MPI Programs"}]},{"@id":"https://cir.nii.ac.jp/crid/1390001204379698816","@type":"Article","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Understanding the Impact of BPRAM on Incremental Checkpoint"}]},{"@id":"https://cir.nii.ac.jp/crid/1390001205750876928","@type":"Article","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Composing resilience techniques: ABFT, periodic and incremental checkpointing"}]},{"@id":"https://cir.nii.ac.jp/crid/1390007757214556032","@type":"Article","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Enhanced Sender-Based Message Logging for Reducing Forced Checkpointing Overhead in Distributed Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1390282679354345472","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"A Checkpointing Method with Small Checkpoint Latency"}]},{"@id":"https://cir.nii.ac.jp/crid/1390282679355517184","@type":"Article","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Energy-Performance Modeling of Speculative Checkpointing for Exascale Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1390282679355612288","@type":"Article","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"On Reducing Rollback Propagation Effect of Optimistic Message Logging for Group-Based Distributed Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1390282679355811456","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"A Concurrent Partial Snapshot Algorithm for Large-Scale and Dynamic Distributed Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1390282679355877888","@type":"Article","relationType":["isReferencedBy","isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Lightweight Consistent Recovery Algorithm for Sender-Based Message Logging in Distributed Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1390282679358432384","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"A Tree-Based Checkpointing Architecture for the Dependability of FPGA Computing"}]},{"@id":"https://cir.nii.ac.jp/crid/1390282680268781696","@type":"Article","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Robust System Design"}]},{"@id":"https://cir.nii.ac.jp/crid/1390290769928487936","@type":"Article","relationType":["isReferencedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Task-Level Resilience: Checkpointing vs. Supervision"}]},{"@id":"https://cir.nii.ac.jp/crid/1520009408451466112","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"Stable Storage for Wireless Multihop Access Networks"},{"@language":"ja-Kana","@value":"Stable Storage for Wireless Multihop Access Networks"}]},{"@id":"https://cir.nii.ac.jp/crid/1520009408762521984","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"フォールト/リカバリモデルを考慮した耐故障性をもつMPIフレームワークCuckoo FTMPIの提案と評価"},{"@language":"ja-Kana","@value":"フォールト リカバリ モデル オ コウリョ シタ タイコショウセイ オ モツ MPI フレームワーク Cuckoo FTMPI ノ テイアン ト ヒョウカ"}]},{"@id":"https://cir.nii.ac.jp/crid/1520009408918292608","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"ハイブリッドチェックポインティングの回復能力の評価"},{"@language":"ja-Kana","@value":"ハイブリッド チェックポインティング ノ カイフク ノウリョク ノ ヒョウカ"}]},{"@id":"https://cir.nii.ac.jp/crid/1520009409420177408","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"Speculativeチェックポインティングの設計と実装"},{"@language":"ja-Kana","@value":"Speculative チェックポインティング ノ セッケイ ト ジッソウ"}]},{"@id":"https://cir.nii.ac.jp/crid/1520290884380452608","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"空間的・時間的な故障率の変動を考慮したチェックポインティング手法の初期検討"},{"@language":"ja-Kana","@value":"クウカンテキ ジカンテキ ナ コショウリツ ノ ヘンドウ オ コウリョ シタ チェックポインティング シュホウ ノ ショキ ケントウ"}]},{"@id":"https://cir.nii.ac.jp/crid/1520290884397560704","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"分散チェックポインティングの評価手法に関する検討"},{"@language":"ja-Kana","@value":"ブンサン チェックポインティング ノ ヒョウカ シュホウ ニ カンスル ケントウ"}]},{"@id":"https://cir.nii.ac.jp/crid/1520290884466038400","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"多重故障を考慮した計算機クラスタ向けSkewed Checkpointingの検討"},{"@language":"ja-Kana","@value":"タジュウ コショウ オ コウリョ シタ ケイサンキ クラスタ ムケ Skewed Checkpointing ノ ケントウ"}]},{"@id":"https://cir.nii.ac.jp/crid/1520290884491543040","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"フォールト/リカバリモデルを考慮した耐故障性をもつMPIフレームワークABARISの提案と評価"},{"@language":"ja-Kana","@value":"フォールト リカバリ モデル オ コウリョシタ タイコショウセイ オ モツ MPI フレームワーク ABARIS ノ テイアン ト ヒョウカ"}]},{"@id":"https://cir.nii.ac.jp/crid/1520572358371339648","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"ローテーショナル・ワンミラー非連携チェックポインティングのリカバラビリティ"},{"@language":"ja-Kana","@value":"ローテーショナル ワンミラー ヒレンケイ チェックポインティング ノ リカバラビリティ"}]},{"@id":"https://cir.nii.ac.jp/crid/1520572358733453568","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"空間的に故障率が異なる計算機クラスタシステムにおけるチェックポインティング"},{"@language":"ja-Kana","@value":"クウカンテキ ニ コショウリツ ガ コトナル ケイサンキ クラスタ システム ニ オケル チェックポインティング"}]},{"@id":"https://cir.nii.ac.jp/crid/1520572358941108096","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"高信頼HPCクラスタのためのチェックポインティング高速化の検討"},{"@language":"ja-Kana","@value":"コウシンライ HPC クラスタ ノ タメ ノ チェックポインティング コウソクカ ノ ケントウ"}]},{"@id":"https://cir.nii.ac.jp/crid/1520853834230654976","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@value":"差分スナップショットを伴う連携チェックポインティングの提案と評価"},{"@language":"ja-Kana","@value":"サブン スナップショット オ トモナウ レンケイ チェックポインティング ノ テイアン ト ヒョウカ"}]},{"@id":"https://cir.nii.ac.jp/crid/1570009752027867264","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"無線マルチホップアクセスネットワークにおける安定記憶実現手法"},{"@language":"en","@value":"Stable Storage for Wireless Multihop Access Networks"}]},{"@id":"https://cir.nii.ac.jp/crid/1570009752557674240","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Evaluation of Checkpointing Mechanism on SCore Cluster System"}]},{"@id":"https://cir.nii.ac.jp/crid/1570009752661532928","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Analytical Model on Hybrid State Saving with a Limited Number of Checkpoints and Bound Rollbacks"}]},{"@id":"https://cir.nii.ac.jp/crid/1571417127440779264","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"An Efficient Centralized Algorithm Ensuring Consistent Recovery in Causal Message Logging with Independent Checkpointing"}]},{"@id":"https://cir.nii.ac.jp/crid/1571417127529092992","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"MPI-CUDA Applications Checkpointing"}]},{"@id":"https://cir.nii.ac.jp/crid/1571698602417665024","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"MPICH-GF : Transparent Checkpointing and Rollback-Recovery for Grid-Enabled MPI Processes"}]},{"@id":"https://cir.nii.ac.jp/crid/1572824501795440256","@type":"Article","relationType":["isCitedBy"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Controller/Precompiler for Portable Checkpointing"}]}],"dataSourceIdentifier":[{"@type":"CROSSREF","@value":"10.1145/568522.568525"},{"@type":"CIA","@value":"80015493653"},{"@type":"CROSSREF","@value":"10.1007/978-3-319-17353-5_25_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.e96.d.141_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.e96.d.663_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.e96.d.886_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.2197/ipsjtsldm.4.2_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.2017pap0002_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.e95.d.786_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.e94.d.1712_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1109/tse.2013.49_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.e96.d.2473_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.e97.d.65_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1007/978-3-030-52482-1_14_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1109/tpds.2021.3051747_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.2017rcp0010_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1587/transinf.2021edl8027_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.15803/ijnc.12.1_47_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1109/ccgrid.2014.24_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1109/cluster.2012.71_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1109/candarw.2018.00060_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.15803/ijnc.5.1_2_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1002/cpe.5647_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"},{"@type":"CROSSREF","@value":"10.1002/ecjc.20296_references_DOI_1lohnBv26W9JQ79BxwTgUrAKNXe"}]}