diff --git a/ouroboros-consensus-diffusion/src/ouroboros-consensus-diffusion/Ouroboros/Consensus/Node.hs b/ouroboros-consensus-diffusion/src/ouroboros-consensus-diffusion/Ouroboros/Consensus/Node.hs index f41e0e23bf..cd5d737752 100644 --- a/ouroboros-consensus-diffusion/src/ouroboros-consensus-diffusion/Ouroboros/Consensus/Node.hs +++ b/ouroboros-consensus-diffusion/src/ouroboros-consensus-diffusion/Ouroboros/Consensus/Node.hs @@ -525,6 +525,10 @@ runWith RunNodeArgs{..} encAddrNtN decAddrNtN LowLevelRunNodeArgs{..} = forM_ (sanityCheckConfig cfg) $ \issue -> traceWith (consensusSanityCheckTracer rnTraceConsensus) issue + let snapshotPolicyArgs = + lgrSnapshotPolicyArgs $ ChainDB.cdbLgrDbArgs llrnChainDbArgsDefaults + forM_ (sanityCheckSnapshotPolicyArgs snapshotPolicyArgs) $ \issue -> + traceWith (consensusSanityCheckTracer rnTraceConsensus) issue (chainDB, finalArgs) <- openChainDB diff --git a/ouroboros-consensus.cabal b/ouroboros-consensus.cabal index a25b7acfc8..2e7dee5a18 100644 --- a/ouroboros-consensus.cabal +++ b/ouroboros-consensus.cabal @@ -815,6 +815,7 @@ test-suite storage-test Test.Ouroboros.Storage.ImmutableDB.StateMachine Test.Ouroboros.Storage.LedgerDB Test.Ouroboros.Storage.LedgerDB.Serialisation + Test.Ouroboros.Storage.LedgerDB.SnapshotPolicySanityCheck Test.Ouroboros.Storage.LedgerDB.Snapshots Test.Ouroboros.Storage.LedgerDB.StateMachine Test.Ouroboros.Storage.LedgerDB.StateMachine.TestBlock diff --git a/ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Block/SupportsSanityCheck.hs b/ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Block/SupportsSanityCheck.hs index b11a2eb321..cbdffbe493 100644 --- a/ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Block/SupportsSanityCheck.hs +++ b/ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Block/SupportsSanityCheck.hs @@ -21,16 +21,52 @@ import Control.Exception import Data.List.NonEmpty (NonEmpty (..)) import qualified Data.List.NonEmpty as NonEmpty import Data.Maybe (catMaybes) +import Data.Time.Clock (DiffTime) +import Data.Word (Word64) import Ouroboros.Consensus.Config (TopLevelConfig) import Ouroboros.Consensus.Config.SecurityParam --- | An issue found in the 'TopLevelConfig' for a block. See 'displayException' +-- | An issue found in the consensus configuration. See 'displayException' -- for human-readable descriptions of each of these cases, especially when -- presenting these to users. data SanityCheckIssue = -- | Configuration contains multiple security parameters. This may cause -- strange behaviour around era boundaries. InconsistentSecurityParam (NonEmpty SecurityParam) + | -- | The configured 'minimumDelay' in 'SnapshotDelayRange' is greater than + -- 'maximumDelay'. The random snapshot delay will be sampled from an + -- inverted range, which is almost certainly a misconfiguration. + SnapshotDelayRangeInverted + -- | The configured minimumDelay (the larger value) + !DiffTime + -- | The configured maximumDelay (the smaller value) + !DiffTime + | -- | The configured 'minimumDelay' in 'SnapshotDelayRange' is negative. + -- A negative delay has no meaningful interpretation. + SnapshotDelayRangeNegativeMinimum + -- | The negative minimumDelay + !DiffTime + | -- | The configured 'sfaRateLimit' is non-positive, which disables snapshot + -- rate limiting entirely. Without a rate limit, snapshots may be taken + -- very frequently during bulk sync, causing excessive disk I/O. + SnapshotRateLimitDisabled + | -- | The configured 'sfaRateLimit' exceeds 24 hours. At steady state, the + -- node may go more than a day between snapshots, significantly increasing + -- replay time after an unclean restart. + SnapshotRateLimitSuspiciouslyLarge + -- | The configured rate limit + !DiffTime + | -- | The configured number of on-disk snapshots to keep is zero. Snapshots + -- will be written to disk and then immediately deleted, leaving nothing + -- for crash recovery. The node will have to replay from genesis on every + -- unclean restart. + SnapshotNumZero + | -- | The configured snapshot interval does not divide 432000 (the Cardano + -- mainnet epoch length in slots). Snapshots will not land on epoch + -- boundaries, breaking Mithril compatibility. + SnapshotIntervalNotDivisorOfEpoch + -- | The configured interval in slots + !Word64 deriving (Show, Eq) instance Exception SanityCheckIssue where @@ -42,6 +78,61 @@ instance Exception SanityCheckIssue where , "eras of a HardForkBlock: " , show (NonEmpty.toList ks) ] + SnapshotDelayRangeInverted mn mx -> + mconcat + [ "SnapshotDelayRangeInverted: " + , "The configured snapshot delay range has minimumDelay (" + , show mn + , ") greater than maximumDelay (" + , show mx + , "). The random snapshot delay will be sampled from an inverted range. " + , "Please ensure minimumDelay <= maximumDelay in sfaDelaySnapshotRange." + ] + SnapshotDelayRangeNegativeMinimum mn -> + mconcat + [ "SnapshotDelayRangeNegativeMinimum: " + , "The configured snapshot delay range has a negative minimumDelay: " + , show mn + , ". A negative delay has no meaningful interpretation. " + , "Please set minimumDelay to a non-negative value in sfaDelaySnapshotRange." + ] + SnapshotRateLimitDisabled -> + mconcat + [ "SnapshotRateLimitDisabled: " + , "The configured sfaRateLimit is non-positive, which disables snapshot " + , "rate limiting entirely. Without a rate limit, snapshots may be taken " + , "very frequently during bulk sync, causing excessive disk I/O. " + , "The default rate limit is 10 minutes." + ] + SnapshotRateLimitSuspiciouslyLarge rl -> + mconcat + [ "SnapshotRateLimitSuspiciouslyLarge: " + , "The configured sfaRateLimit (" + , show rl + , ") exceeds 24 hours. At steady state, the node may go more than a day " + , "between snapshots, significantly increasing replay time after an " + , "unclean restart. The default rate limit is 10 minutes." + ] + SnapshotNumZero -> + mconcat + [ "SnapshotNumZero: " + , "The configured number of on-disk snapshots to keep (spaNum) is 0. " + , "Snapshots will be written to disk and immediately deleted, leaving " + , "nothing for crash recovery. The node will have to replay the chain " + , "from genesis on every unclean restart. " + , "Consider setting spaNum to at least 2 (the default)." + ] + SnapshotIntervalNotDivisorOfEpoch interval -> + mconcat + [ "SnapshotIntervalNotDivisorOfEpoch: " + , "The configured sfaInterval (" + , show interval + , " slots) does not evenly divide the Cardano mainnet epoch length " + , "(432000 slots). Snapshots will not consistently land on epoch " + , "boundaries, which breaks Mithril compatibility. " + , "Consider using an interval that divides 432000 evenly, " + , "such as 4320 (the default, = 2k for k=2160)." + ] -- | 'BlockSupportsSanityCheck' provides evidence that a block can be sanity -- checked for common issues on node startup. 'sanityCheckConfig', which runs diff --git a/ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/LedgerDB/Snapshots.hs b/ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/LedgerDB/Snapshots.hs index 595f0929c1..4e9daa14f3 100644 --- a/ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/LedgerDB/Snapshots.hs +++ b/ouroboros-consensus/src/ouroboros-consensus/Ouroboros/Consensus/Storage/LedgerDB/Snapshots.hs @@ -82,6 +82,8 @@ module Ouroboros.Consensus.Storage.LedgerDB.Snapshots , SnapshotFrequency (..) , SnapshotFrequencyArgs (..) , defaultSnapshotPolicy + , mithrilEpochSize + , sanityCheckSnapshotPolicyArgs , pattern DoDiskSnapshotChecksum , pattern NoDoDiskSnapshotChecksum @@ -710,6 +712,59 @@ defaultSnapshotPolicy (SecurityParam k) args = -- Most relevant during syncing. defRateLimit = secondsToDiffTime $ 10 * 60 +-- | The Cardano mainnet epoch length in slots, used as the reference for the +-- Mithril snapshot compatibility check in 'sanityCheckSnapshotPolicyArgs'. +-- Mithril requires a ledger snapshot at each epoch boundary; for snapshots to +-- land on every epoch boundary, 'sfaInterval' must divide this value evenly. +mithrilEpochSize :: Word64 +mithrilEpochSize = 432000 + +-- | Check a 'SnapshotPolicyArgs' for suspicious configurations and return a +-- (possibly empty) list of 'SanityCheckIssue's describing any problems found. +-- +-- Only 'Override' values are checked — 'UseDefault' values are known-good and +-- are never flagged. Checks that are specific to 'SnapshotFrequency' are +-- skipped entirely when 'spaFrequency' is 'DisableSnapshots'. +sanityCheckSnapshotPolicyArgs :: SnapshotPolicyArgs -> [SanityCheckIssue] +sanityCheckSnapshotPolicyArgs SnapshotPolicyArgs{spaFrequency, spaNum} = + catMaybes $ + checkNumZero spaNum + : case spaFrequency of + DisableSnapshots -> [] + SnapshotFrequency sfa -> checkFrequencyArgs sfa + where + checkNumZero (Override (NumOfDiskSnapshots 0)) = Just SnapshotNumZero + checkNumZero _ = Nothing + + checkFrequencyArgs SnapshotFrequencyArgs{sfaDelaySnapshotRange, sfaRateLimit, sfaInterval} = + [ checkDelayRange sfaDelaySnapshotRange + , checkRateLimitDisabled sfaRateLimit + , checkRateLimitLarge sfaRateLimit + , checkMithrilDivisibility sfaInterval + ] + + checkDelayRange UseDefault = Nothing + checkDelayRange (Override (SnapshotDelayRange mn mx)) + | mn < 0 = Just (SnapshotDelayRangeNegativeMinimum mn) + | mn > mx = Just (SnapshotDelayRangeInverted mn mx) + | otherwise = Nothing + + checkRateLimitDisabled UseDefault = Nothing + checkRateLimitDisabled (Override rl) + | rl <= 0 = Just SnapshotRateLimitDisabled + | otherwise = Nothing + + checkRateLimitLarge UseDefault = Nothing + checkRateLimitLarge (Override rl) + | rl > 86400 = Just (SnapshotRateLimitSuspiciouslyLarge rl) + | otherwise = Nothing + + checkMithrilDivisibility UseDefault = Nothing + checkMithrilDivisibility (Override interval) + | mithrilEpochSize `mod` unNonZero interval /= 0 = + Just (SnapshotIntervalNotDivisorOfEpoch (unNonZero interval)) + | otherwise = Nothing + {------------------------------------------------------------------------------- Tracing snapshot events -------------------------------------------------------------------------------} diff --git a/ouroboros-consensus/test/storage-test/Test/Ouroboros/Storage/LedgerDB.hs b/ouroboros-consensus/test/storage-test/Test/Ouroboros/Storage/LedgerDB.hs index 3d876520a5..4b93f632d5 100644 --- a/ouroboros-consensus/test/storage-test/Test/Ouroboros/Storage/LedgerDB.hs +++ b/ouroboros-consensus/test/storage-test/Test/Ouroboros/Storage/LedgerDB.hs @@ -6,6 +6,7 @@ module Test.Ouroboros.Storage.LedgerDB (tests) where import qualified Test.Ouroboros.Storage.LedgerDB.Serialisation as Serialisation +import qualified Test.Ouroboros.Storage.LedgerDB.SnapshotPolicySanityCheck as SnapshotPolicySanityCheck import qualified Test.Ouroboros.Storage.LedgerDB.Snapshots as Snapshots import qualified Test.Ouroboros.Storage.LedgerDB.StateMachine as StateMachine import qualified Test.Ouroboros.Storage.LedgerDB.V1.BackingStore as BackingStore @@ -24,6 +25,7 @@ tests = , -- Independent of the LedgerDB implementation Serialisation.tests , Snapshots.tests + , SnapshotPolicySanityCheck.tests , -- Tests both V1 and V2 StateMachine.tests ] diff --git a/ouroboros-consensus/test/storage-test/Test/Ouroboros/Storage/LedgerDB/SnapshotPolicySanityCheck.hs b/ouroboros-consensus/test/storage-test/Test/Ouroboros/Storage/LedgerDB/SnapshotPolicySanityCheck.hs new file mode 100644 index 0000000000..053b0acb03 --- /dev/null +++ b/ouroboros-consensus/test/storage-test/Test/Ouroboros/Storage/LedgerDB/SnapshotPolicySanityCheck.hs @@ -0,0 +1,189 @@ +module Test.Ouroboros.Storage.LedgerDB.SnapshotPolicySanityCheck (tests) where + +import Cardano.Ledger.BaseTypes (unsafeNonZero) +import Data.Time.Clock (DiffTime, secondsToDiffTime) +import Data.Word (Word64) +import Ouroboros.Consensus.Block.SupportsSanityCheck +import Ouroboros.Consensus.Storage.LedgerDB.Snapshots +import Test.QuickCheck +import Test.Tasty +import Test.Tasty.QuickCheck (testProperty) +import Ouroboros.Consensus.Util.Args (OverrideOrDefault (..)) + +tests :: TestTree +tests = + testGroup + "SnapshotPolicySanityCheck" + [ testProperty "SnapshotNumZero fires iff spaNum overridden to 0" $ + prop_num_zero_iff + , testProperty + "SnapshotDelayRangeInverted fires iff minimumDelay > maximumDelay (given minimumDelay >= 0)" + $ prop_delay_range_inverted_iff + , testProperty "SnapshotDelayRangeNegativeMinimum fires iff minimumDelay < 0" $ + prop_delay_range_negative_minimum_iff + , testProperty "SnapshotRateLimitDisabled fires iff sfaRateLimit overridden to <= 0" $ + prop_rate_limit_disabled_iff + , testProperty + "SnapshotRateLimitSuspiciouslyLarge fires iff sfaRateLimit overridden to > 86400s (given > 0)" + $ prop_rate_limit_large_iff + , testProperty "SnapshotIntervalNotDivisorOfEpoch fires iff 432000 mod interval /= 0" $ + prop_mithril_divisibility_iff + , testProperty "no frequency issues emitted under DisableSnapshots" $ + prop_disable_snapshots_no_frequency_issues + ] + +------------------------------------------------------------------------------- +-- Properties +------------------------------------------------------------------------------- + +-- | 'SnapshotNumZero' fires if and only if 'spaNum' is overridden to 0. +-- Uses 'DisableSnapshots' so that no other checks interfere. +prop_num_zero_iff :: Property +prop_num_zero_iff = + forAll (arbitrary :: Gen Word) $ \n -> + let issues = + sanityCheckSnapshotPolicyArgs + SnapshotPolicyArgs{spaFrequency = DisableSnapshots, spaNum = Override (NumOfDiskSnapshots n)} + in (SnapshotNumZero `elem` issues) === (n == 0) + +-- | 'SnapshotDelayRangeInverted' fires if and only if @minimumDelay > maximumDelay@, +-- when @minimumDelay >= 0@ (avoiding the 'SnapshotDelayRangeNegativeMinimum' case). +prop_delay_range_inverted_iff :: Property +prop_delay_range_inverted_iff = + forAll genNonNegativeDiffTime $ \mn -> + forAll genDiffTime $ \mx -> + let issues = sanityCheckSnapshotPolicyArgs (withDelayRange (SnapshotDelayRange mn mx)) + fired = any isInverted issues + in fired === (mn > mx) + where + isInverted (SnapshotDelayRangeInverted _ _) = True + isInverted _ = False + +-- | 'SnapshotDelayRangeNegativeMinimum' fires if and only if @minimumDelay < 0@. +prop_delay_range_negative_minimum_iff :: Property +prop_delay_range_negative_minimum_iff = + forAll genDiffTime $ \mn -> + let issues = sanityCheckSnapshotPolicyArgs (withDelayRange (SnapshotDelayRange mn 0)) + fired = any isNegativeMin issues + in fired === (mn < 0) + where + isNegativeMin (SnapshotDelayRangeNegativeMinimum _) = True + isNegativeMin _ = False + +-- | 'SnapshotRateLimitDisabled' fires if and only if 'sfaRateLimit' is overridden +-- to a non-positive value. +prop_rate_limit_disabled_iff :: Property +prop_rate_limit_disabled_iff = + forAll genDiffTime $ \rl -> + let issues = sanityCheckSnapshotPolicyArgs (withRateLimit rl) + in (SnapshotRateLimitDisabled `elem` issues) === (rl <= 0) + +-- | 'SnapshotRateLimitSuspiciouslyLarge' fires if and only if 'sfaRateLimit' is +-- overridden to a value greater than 86400s. We restrict to positive rate limits +-- to avoid the 'SnapshotRateLimitDisabled' check interfering. +prop_rate_limit_large_iff :: Property +prop_rate_limit_large_iff = + forAll genPositiveDiffTime $ \rl -> + let issues = sanityCheckSnapshotPolicyArgs (withRateLimit rl) + fired = any isLarge issues + in fired === (rl > 86400) + where + isLarge (SnapshotRateLimitSuspiciouslyLarge _) = True + isLarge _ = False + +-- | 'SnapshotIntervalNotDivisorOfEpoch' fires if and only if +-- @'mithrilEpochSize' \`mod\` interval /= 0@. +prop_mithril_divisibility_iff :: Property +prop_mithril_divisibility_iff = + forAll genNonZeroWord64 $ \n -> + let issues = sanityCheckSnapshotPolicyArgs (withInterval n) + fired = any isMithrilIssue issues + in fired === (mithrilEpochSize `mod` n /= 0) + where + isMithrilIssue (SnapshotIntervalNotDivisorOfEpoch _) = True + isMithrilIssue _ = False + +-- | With 'DisableSnapshots', no frequency-related issues are ever emitted, +-- regardless of what 'spaNum' is set to. +prop_disable_snapshots_no_frequency_issues :: Property +prop_disable_snapshots_no_frequency_issues = + forAll (arbitrary :: Gen Word) $ \n -> + let issues = + sanityCheckSnapshotPolicyArgs + SnapshotPolicyArgs{spaFrequency = DisableSnapshots, spaNum = Override (NumOfDiskSnapshots n)} + in filter isFrequencyIssue issues === [] + where + -- SnapshotNumZero is the only non-frequency issue that can appear here; + -- everything else would require a SnapshotFrequency. + isFrequencyIssue SnapshotNumZero = False + isFrequencyIssue InconsistentSecurityParam{} = False + isFrequencyIssue _ = True + +------------------------------------------------------------------------------- +-- Generators +------------------------------------------------------------------------------- + +-- | Generate an arbitrary 'DiffTime' (integer seconds, possibly negative). +genDiffTime :: Gen DiffTime +genDiffTime = secondsToDiffTime <$> arbitrary + +-- | Generate a non-negative 'DiffTime'. +genNonNegativeDiffTime :: Gen DiffTime +genNonNegativeDiffTime = secondsToDiffTime . abs <$> arbitrary + +-- | Generate a strictly positive 'DiffTime'. +genPositiveDiffTime :: Gen DiffTime +genPositiveDiffTime = secondsToDiffTime . getPositive <$> arbitrary + +-- | Generate a non-zero 'Word64'. +genNonZeroWord64 :: Gen Word64 +genNonZeroWord64 = getPositive <$> arbitrary + +------------------------------------------------------------------------------- +-- Helpers +------------------------------------------------------------------------------- + +-- | Build a 'SnapshotPolicyArgs' with a specific 'SnapshotDelayRange' override. +withDelayRange :: SnapshotDelayRange -> SnapshotPolicyArgs +withDelayRange sdr = + SnapshotPolicyArgs + { spaFrequency = + SnapshotFrequency + SnapshotFrequencyArgs + { sfaInterval = UseDefault + , sfaOffset = UseDefault + , sfaRateLimit = UseDefault + , sfaDelaySnapshotRange = Override sdr + } + , spaNum = UseDefault + } + +-- | Build a 'SnapshotPolicyArgs' with a specific 'sfaRateLimit' override. +withRateLimit :: DiffTime -> SnapshotPolicyArgs +withRateLimit rl = + SnapshotPolicyArgs + { spaFrequency = + SnapshotFrequency + SnapshotFrequencyArgs + { sfaInterval = UseDefault + , sfaOffset = UseDefault + , sfaRateLimit = Override rl + , sfaDelaySnapshotRange = UseDefault + } + , spaNum = UseDefault + } + +-- | Build a 'SnapshotPolicyArgs' with a specific 'sfaInterval' override. +withInterval :: Word64 -> SnapshotPolicyArgs +withInterval n = + SnapshotPolicyArgs + { spaFrequency = + SnapshotFrequency + SnapshotFrequencyArgs + { sfaInterval = Override (unsafeNonZero n) + , sfaOffset = UseDefault + , sfaRateLimit = UseDefault + , sfaDelaySnapshotRange = UseDefault + } + , spaNum = UseDefault + }