Skip to content

Commit b2395be

Browse files
fraser-iohkgeo2a
authored andcommitted
add sanity checks for weird (i.e. likely-broken) SnapshotPolicyArgs configurations
1 parent b1e2b2b commit b2395be

6 files changed

Lines changed: 342 additions & 1 deletion

File tree

ouroboros-consensus-diffusion/src/ouroboros-consensus-diffusion/Ouroboros/Consensus/Node.hs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,10 @@ runWith RunNodeArgs{..} encAddrNtN decAddrNtN LowLevelRunNodeArgs{..} =
525525

526526
forM_ (sanityCheckConfig cfg) $ \issue ->
527527
traceWith (consensusSanityCheckTracer rnTraceConsensus) issue
528+
let snapshotPolicyArgs =
529+
lgrSnapshotPolicyArgs $ ChainDB.cdbLgrDbArgs llrnChainDbArgsDefaults
530+
forM_ (sanityCheckSnapshotPolicyArgs snapshotPolicyArgs) $ \issue ->
531+
traceWith (consensusSanityCheckTracer rnTraceConsensus) issue
528532

529533
(chainDB, finalArgs) <-
530534
openChainDB

ouroboros-consensus.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,7 @@ test-suite storage-test
810810
Test.Ouroboros.Storage.ImmutableDB.StateMachine
811811
Test.Ouroboros.Storage.LedgerDB
812812
Test.Ouroboros.Storage.LedgerDB.Serialisation
813+
Test.Ouroboros.Storage.LedgerDB.SnapshotPolicySanityCheck
813814
Test.Ouroboros.Storage.LedgerDB.Snapshots
814815
Test.Ouroboros.Storage.LedgerDB.StateMachine
815816
Test.Ouroboros.Storage.LedgerDB.StateMachine.TestBlock

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Block/SupportsSanityCheck.hs

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,52 @@ import Control.Exception
2121
import Data.List.NonEmpty (NonEmpty (..))
2222
import qualified Data.List.NonEmpty as NonEmpty
2323
import Data.Maybe (catMaybes)
24+
import Data.Time.Clock (DiffTime)
25+
import Data.Word (Word64)
2426
import Ouroboros.Consensus.Config (TopLevelConfig)
2527
import Ouroboros.Consensus.Config.SecurityParam
2628

27-
-- | An issue found in the 'TopLevelConfig' for a block. See 'displayException'
29+
-- | An issue found in the consensus configuration. See 'displayException'
2830
-- for human-readable descriptions of each of these cases, especially when
2931
-- presenting these to users.
3032
data SanityCheckIssue
3133
= -- | Configuration contains multiple security parameters. This may cause
3234
-- strange behaviour around era boundaries.
3335
InconsistentSecurityParam (NonEmpty SecurityParam)
36+
| -- | The configured 'minimumDelay' in 'SnapshotDelayRange' is greater than
37+
-- 'maximumDelay'. The random snapshot delay will be sampled from an
38+
-- inverted range, which is almost certainly a misconfiguration.
39+
SnapshotDelayRangeInverted
40+
-- | The configured minimumDelay (the larger value)
41+
!DiffTime
42+
-- | The configured maximumDelay (the smaller value)
43+
!DiffTime
44+
| -- | The configured 'minimumDelay' in 'SnapshotDelayRange' is negative.
45+
-- A negative delay has no meaningful interpretation.
46+
SnapshotDelayRangeNegativeMinimum
47+
-- | The negative minimumDelay
48+
!DiffTime
49+
| -- | The configured 'sfaRateLimit' is non-positive, which disables snapshot
50+
-- rate limiting entirely. Without a rate limit, snapshots may be taken
51+
-- very frequently during bulk sync, causing excessive disk I/O.
52+
SnapshotRateLimitDisabled
53+
| -- | The configured 'sfaRateLimit' exceeds 24 hours. At steady state, the
54+
-- node may go more than a day between snapshots, significantly increasing
55+
-- replay time after an unclean restart.
56+
SnapshotRateLimitSuspiciouslyLarge
57+
-- | The configured rate limit
58+
!DiffTime
59+
| -- | The configured number of on-disk snapshots to keep is zero. Snapshots
60+
-- will be written to disk and then immediately deleted, leaving nothing
61+
-- for crash recovery. The node will have to replay from genesis on every
62+
-- unclean restart.
63+
SnapshotNumZero
64+
| -- | The configured snapshot interval does not divide 432000 (the Cardano
65+
-- mainnet epoch length in slots). Snapshots will not land on epoch
66+
-- boundaries, breaking Mithril compatibility.
67+
SnapshotIntervalNotDivisorOfEpoch
68+
-- | The configured interval in slots
69+
!Word64
3470
deriving (Show, Eq)
3571

3672
instance Exception SanityCheckIssue where
@@ -42,6 +78,61 @@ instance Exception SanityCheckIssue where
4278
, "eras of a HardForkBlock: "
4379
, show (NonEmpty.toList ks)
4480
]
81+
SnapshotDelayRangeInverted mn mx ->
82+
mconcat
83+
[ "SnapshotDelayRangeInverted: "
84+
, "The configured snapshot delay range has minimumDelay ("
85+
, show mn
86+
, ") greater than maximumDelay ("
87+
, show mx
88+
, "). The random snapshot delay will be sampled from an inverted range. "
89+
, "Please ensure minimumDelay <= maximumDelay in sfaDelaySnapshotRange."
90+
]
91+
SnapshotDelayRangeNegativeMinimum mn ->
92+
mconcat
93+
[ "SnapshotDelayRangeNegativeMinimum: "
94+
, "The configured snapshot delay range has a negative minimumDelay: "
95+
, show mn
96+
, ". A negative delay has no meaningful interpretation. "
97+
, "Please set minimumDelay to a non-negative value in sfaDelaySnapshotRange."
98+
]
99+
SnapshotRateLimitDisabled ->
100+
mconcat
101+
[ "SnapshotRateLimitDisabled: "
102+
, "The configured sfaRateLimit is non-positive, which disables snapshot "
103+
, "rate limiting entirely. Without a rate limit, snapshots may be taken "
104+
, "very frequently during bulk sync, causing excessive disk I/O. "
105+
, "The default rate limit is 10 minutes."
106+
]
107+
SnapshotRateLimitSuspiciouslyLarge rl ->
108+
mconcat
109+
[ "SnapshotRateLimitSuspiciouslyLarge: "
110+
, "The configured sfaRateLimit ("
111+
, show rl
112+
, ") exceeds 24 hours. At steady state, the node may go more than a day "
113+
, "between snapshots, significantly increasing replay time after an "
114+
, "unclean restart. The default rate limit is 10 minutes."
115+
]
116+
SnapshotNumZero ->
117+
mconcat
118+
[ "SnapshotNumZero: "
119+
, "The configured number of on-disk snapshots to keep (spaNum) is 0. "
120+
, "Snapshots will be written to disk and immediately deleted, leaving "
121+
, "nothing for crash recovery. The node will have to replay the chain "
122+
, "from genesis on every unclean restart. "
123+
, "Consider setting spaNum to at least 2 (the default)."
124+
]
125+
SnapshotIntervalNotDivisorOfEpoch interval ->
126+
mconcat
127+
[ "SnapshotIntervalNotDivisorOfEpoch: "
128+
, "The configured sfaInterval ("
129+
, show interval
130+
, " slots) does not evenly divide the Cardano mainnet epoch length "
131+
, "(432000 slots). Snapshots will not consistently land on epoch "
132+
, "boundaries, which breaks Mithril compatibility. "
133+
, "Consider using an interval that divides 432000 evenly, "
134+
, "such as 4320 (the default, = 2k for k=2160)."
135+
]
45136

46137
-- | 'BlockSupportsSanityCheck' provides evidence that a block can be sanity
47138
-- checked for common issues on node startup. 'sanityCheckConfig', which runs

ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/LedgerDB/Snapshots.hs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ module Ouroboros.Consensus.Storage.LedgerDB.Snapshots
8282
, SnapshotFrequency (..)
8383
, SnapshotFrequencyArgs (..)
8484
, defaultSnapshotPolicy
85+
, mithrilEpochSize
86+
, sanityCheckSnapshotPolicyArgs
8587
, pattern DoDiskSnapshotChecksum
8688
, pattern NoDoDiskSnapshotChecksum
8789

@@ -706,6 +708,59 @@ defaultSnapshotPolicy (SecurityParam k) args =
706708
-- Most relevant during syncing.
707709
defRateLimit = secondsToDiffTime $ 10 * 60
708710

711+
-- | The Cardano mainnet epoch length in slots, used as the reference for the
712+
-- Mithril snapshot compatibility check in 'sanityCheckSnapshotPolicyArgs'.
713+
-- Mithril requires a ledger snapshot at each epoch boundary; for snapshots to
714+
-- land on every epoch boundary, 'sfaInterval' must divide this value evenly.
715+
mithrilEpochSize :: Word64
716+
mithrilEpochSize = 432000
717+
718+
-- | Check a 'SnapshotPolicyArgs' for suspicious configurations and return a
719+
-- (possibly empty) list of 'SanityCheckIssue's describing any problems found.
720+
--
721+
-- Only 'Override' values are checked — 'UseDefault' values are known-good and
722+
-- are never flagged. Checks that are specific to 'SnapshotFrequency' are
723+
-- skipped entirely when 'spaFrequency' is 'DisableSnapshots'.
724+
sanityCheckSnapshotPolicyArgs :: SnapshotPolicyArgs -> [SanityCheckIssue]
725+
sanityCheckSnapshotPolicyArgs SnapshotPolicyArgs{spaFrequency, spaNum} =
726+
catMaybes $
727+
checkNumZero spaNum
728+
: case spaFrequency of
729+
DisableSnapshots -> []
730+
SnapshotFrequency sfa -> checkFrequencyArgs sfa
731+
where
732+
checkNumZero (Override 0) = Just SnapshotNumZero
733+
checkNumZero _ = Nothing
734+
735+
checkFrequencyArgs SnapshotFrequencyArgs{sfaDelaySnapshotRange, sfaRateLimit, sfaInterval} =
736+
[ checkDelayRange sfaDelaySnapshotRange
737+
, checkRateLimitDisabled sfaRateLimit
738+
, checkRateLimitLarge sfaRateLimit
739+
, checkMithrilDivisibility sfaInterval
740+
]
741+
742+
checkDelayRange UseDefault = Nothing
743+
checkDelayRange (Override (SnapshotDelayRange mn mx))
744+
| mn < 0 = Just (SnapshotDelayRangeNegativeMinimum mn)
745+
| mn > mx = Just (SnapshotDelayRangeInverted mn mx)
746+
| otherwise = Nothing
747+
748+
checkRateLimitDisabled UseDefault = Nothing
749+
checkRateLimitDisabled (Override rl)
750+
| rl <= 0 = Just SnapshotRateLimitDisabled
751+
| otherwise = Nothing
752+
753+
checkRateLimitLarge UseDefault = Nothing
754+
checkRateLimitLarge (Override rl)
755+
| rl > 86400 = Just (SnapshotRateLimitSuspiciouslyLarge rl)
756+
| otherwise = Nothing
757+
758+
checkMithrilDivisibility UseDefault = Nothing
759+
checkMithrilDivisibility (Override interval)
760+
| mithrilEpochSize `mod` unNonZero interval /= 0 =
761+
Just (SnapshotIntervalNotDivisorOfEpoch (unNonZero interval))
762+
| otherwise = Nothing
763+
709764
{-------------------------------------------------------------------------------
710765
Tracing snapshot events
711766
-------------------------------------------------------------------------------}

ouroboros-consensus/test/storage-test/Test/Ouroboros/Storage/LedgerDB.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
module Test.Ouroboros.Storage.LedgerDB (tests) where
77

88
import qualified Test.Ouroboros.Storage.LedgerDB.Serialisation as Serialisation
9+
import qualified Test.Ouroboros.Storage.LedgerDB.SnapshotPolicySanityCheck as SnapshotPolicySanityCheck
910
import qualified Test.Ouroboros.Storage.LedgerDB.Snapshots as Snapshots
1011
import qualified Test.Ouroboros.Storage.LedgerDB.StateMachine as StateMachine
1112
import qualified Test.Ouroboros.Storage.LedgerDB.V1.BackingStore as BackingStore
@@ -24,6 +25,7 @@ tests =
2425
, -- Independent of the LedgerDB implementation
2526
Serialisation.tests
2627
, Snapshots.tests
28+
, SnapshotPolicySanityCheck.tests
2729
, -- Tests both V1 and V2
2830
StateMachine.tests
2931
]

0 commit comments

Comments
 (0)