Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
### Breaking

- LedgerDB: implemented *predictable* snapshots, i.e. different nodes with the
same configuration will now create snapshots for the same slots.

See 'SnapshotPolicyArgs' for more details.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<!--
A new scriv changelog fragment.

Uncomment the section that is right (remove the HTML comment wrapper).
For top level release notes, leave all the headers commented out.
-->


### Breaking

Introduced a configurable randomised delay before taking ledger state snapshots.

- Renamed `ldbLastSnapshotWrite` to `ldbLastSnapshotRequestedAt`.
- Added a `delay` argument to `implTryTakeSnapshot`.
- Renamed `ledgerDbMaintenaceThread` to `ledgerDbMaintenanceThread` (fixes the typo in the old name).

### Non-Breaking

- Added `cdbSnapshotDelayRNG` field to `ChainDbEnv`.
- Added `cdbsSnapshotDelayRNG` to `ChainDbSpecificArgs`.
- Added `onDiskSnapshotDelayRange` to `SnapshotPolicy`.
- Added LedgerDB snapshot delay trace events: `SnapshotRequestDelayed` and `SnapshotRequestCompleted`.

<!--
### Patch

- A bullet item for the Patch category.

-->
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ import Ouroboros.Consensus.Util.Orphans ()
import Ouroboros.Network.Block (genesisPoint)
import System.FS.API
import System.IO
import System.Random
import System.Random (genWord64, newStdGen)
import Text.Printf (printf)

{-------------------------------------------------------------------------------
Expand All @@ -78,7 +78,7 @@ openLedgerDB ::
openLedgerDB args =
runWithTempRegistry $
(,()) <$> do
(ldb, _, od) <- case LedgerDB.lgrBackendArgs args of
(ldb, od) <- case LedgerDB.lgrBackendArgs args of
LedgerDB.LedgerDbBackendArgsV1 bss ->
let snapManager = LedgerDB.V1.snapshotManager args
initDb =
Expand Down Expand Up @@ -145,6 +145,7 @@ analyse dbaConfig args =
lsmSalt <- fst . genWord64 <$> newStdGen
ProtocolInfo{pInfoInitLedger = genesisLedger, pInfoConfig = cfg} <-
mkProtocolInfo args
snapshotDelayRng <- newStdGen
let shfs = Node.stdMkChainDbHasFS dbDir
chunkInfo = Node.nodeImmutableDbChunkInfo (configStorage cfg)
flavargs = case ldbBackend of
Expand Down Expand Up @@ -174,6 +175,7 @@ analyse dbaConfig args =
(const True)
shfs
shfs
snapshotDelayRng
flavargs
$ ChainDB.defaultArgs
-- Set @k=1@ to reduce the memory usage of the LedgerDB. We only ever
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ import Ouroboros.Network.Block
import Ouroboros.Network.Point (WithOrigin (..))
import System.Directory
import System.FilePath (takeDirectory, (</>))
import System.Random (newStdGen)

initialize ::
NodeFilePaths ->
Expand Down Expand Up @@ -148,6 +149,7 @@ synthesize ::
IO ForgeResult
synthesize genTxs DBSynthesizerConfig{confOptions, confShelleyGenesis, confDbDir} runP =
withRegistry $ \registry -> do
snapshotDelayRng <- newStdGen
let
epochSize = sgEpochLength confShelleyGenesis
chunkInfo = Node.nodeImmutableDbChunkInfo (configStorage pInfoConfig)
Expand All @@ -161,6 +163,7 @@ synthesize genTxs DBSynthesizerConfig{confOptions, confShelleyGenesis, confDbDir
(const True)
(Node.stdMkChainDbHasFS confDbDir)
(Node.stdMkChainDbHasFS confDbDir)
snapshotDelayRng
flavargs
$ ChainDB.defaultArgs

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,10 @@ runWith RunNodeArgs{..} encAddrNtN decAddrNtN LowLevelRunNodeArgs{..} =

forM_ (sanityCheckConfig cfg) $ \issue ->
traceWith (consensusSanityCheckTracer rnTraceConsensus) issue
let snapshotPolicyArgs =
lgrSnapshotPolicyArgs $ ChainDB.cdbLgrDbArgs llrnChainDbArgsDefaults
forM_ (sanityCheckSnapshotPolicyArgs snapshotPolicyArgs) $ \issue ->
traceWith (consensusSanityCheckTracer rnTraceConsensus) issue

(chainDB, finalArgs) <-
openChainDB
Expand All @@ -533,6 +537,7 @@ runWith RunNodeArgs{..} encAddrNtN decAddrNtN LowLevelRunNodeArgs{..} =
initLedger
llrnMkImmutableHasFS
llrnMkVolatileHasFS
snapshotDelayRng
llrnLdbFlavorArgs
llrnChainDbArgsDefaults
( setLoEinChainDbArgs
Expand Down Expand Up @@ -647,7 +652,8 @@ runWith RunNodeArgs{..} encAddrNtN decAddrNtN LowLevelRunNodeArgs{..} =
where
(gsmAntiThunderingHerd, rng') = splitGen llrnRng
(peerSelectionRng, rng'') = splitGen rng'
(keepAliveRng, ntnAppsRng) = splitGen rng''
(keepAliveRng, rng''') = splitGen rng''
(ntnAppsRng, snapshotDelayRng) = splitGen rng'''

ProtocolInfo
{ pInfoConfig = cfg
Expand Down Expand Up @@ -840,13 +846,15 @@ openChainDB ::
(ChainDB.RelativeMountPoint -> SomeHasFS m) ->
-- | Volatile FS, see 'NodeDatabasePaths'
(ChainDB.RelativeMountPoint -> SomeHasFS m) ->
-- | RNG used to randomise snapshot delays
StdGen ->
LedgerDbBackendArgs m blk ->
-- | A set of default arguments (possibly modified from 'defaultArgs')
Incomplete ChainDbArgs m blk ->
-- | Customise the 'ChainDbArgs'
(Complete ChainDbArgs m blk -> Complete ChainDbArgs m blk) ->
m (ChainDB m blk, Complete ChainDbArgs m blk)
openChainDB registry cfg initLedger fsImm fsVol flavorArgs defArgs customiseArgs =
openChainDB registry cfg initLedger fsImm fsVol delayRng flavorArgs defArgs customiseArgs =
let args =
customiseArgs $
ChainDB.completeChainDbArgs
Expand All @@ -857,6 +865,7 @@ openChainDB registry cfg initLedger fsImm fsVol flavorArgs defArgs customiseArgs
(nodeCheckIntegrity (configStorage cfg))
fsImm
fsVol
delayRng
flavorArgs
defArgs
in (,args) <$> ChainDB.openDB args
Expand Down
7 changes: 5 additions & 2 deletions ouroboros-consensus.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,7 @@ library
primitive,
psqueues ^>=0.2.3,
quiet ^>=0.2,
random,
rawlock ^>=0.1.1,
resource-registry ^>=0.2,
semialign >=1.1,
Expand Down Expand Up @@ -794,6 +795,7 @@ test-suite storage-test
Test.Ouroboros.Storage.ChainDB.FollowerPromptness
Test.Ouroboros.Storage.ChainDB.GcSchedule
Test.Ouroboros.Storage.ChainDB.Iterator
Test.Ouroboros.Storage.ChainDB.LedgerSnapshots
Test.Ouroboros.Storage.ChainDB.Model
Test.Ouroboros.Storage.ChainDB.Model.Test
Test.Ouroboros.Storage.ChainDB.Paths
Expand All @@ -808,7 +810,7 @@ test-suite storage-test
Test.Ouroboros.Storage.ImmutableDB.StateMachine
Test.Ouroboros.Storage.LedgerDB
Test.Ouroboros.Storage.LedgerDB.Serialisation
Test.Ouroboros.Storage.LedgerDB.SnapshotPolicy
Test.Ouroboros.Storage.LedgerDB.SnapshotPolicySanityCheck
Test.Ouroboros.Storage.LedgerDB.Snapshots
Test.Ouroboros.Storage.LedgerDB.StateMachine
Test.Ouroboros.Storage.LedgerDB.StateMachine.TestBlock
Expand All @@ -834,11 +836,12 @@ test-suite storage-test
aeson,
base,
bifunctors,
blockio:sim,
bytestring,
cardano-binary,
cardano-crypto-class,
cardano-ledger-binary:testlib,
cardano-ledger-core:{cardano-ledger-core, testlib},
cardano-ledger-core:cardano-ledger-core,
cardano-slotting:{cardano-slotting, testlib},
cardano-strict-containers,
cborg,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,52 @@ import Control.Exception
import Data.List.NonEmpty (NonEmpty (..))
import qualified Data.List.NonEmpty as NonEmpty
import Data.Maybe (catMaybes)
import Data.Time.Clock (DiffTime)
import Data.Word (Word64)
import Ouroboros.Consensus.Config (TopLevelConfig)
import Ouroboros.Consensus.Config.SecurityParam

-- | An issue found in the 'TopLevelConfig' for a block. See 'displayException'
-- | An issue found in the consensus configuration. See 'displayException'
-- for human-readable descriptions of each of these cases, especially when
-- presenting these to users.
data SanityCheckIssue
= -- | Configuration contains multiple security parameters. This may cause
-- strange behaviour around era boundaries.
InconsistentSecurityParam (NonEmpty SecurityParam)
| -- | The configured 'minimumDelay' in 'SnapshotDelayRange' is greater than
-- 'maximumDelay'. The random snapshot delay will be sampled from an
-- inverted range, which is almost certainly a misconfiguration.
--
-- The 'displayException' text refers to the setting as
-- @sfaDelaySnapshotRange@. Fields are given in the order minimum, maximum.
SnapshotDelayRangeInverted
-- | The configured minimumDelay (the larger value)
!DiffTime
-- | The configured maximumDelay (the smaller value)
!DiffTime
| -- | The configured 'minimumDelay' in 'SnapshotDelayRange' is negative.
-- A negative delay has no meaningful interpretation.
--
-- The 'displayException' text refers to the setting as
-- @sfaDelaySnapshotRange@.
SnapshotDelayRangeNegativeMinimum
-- | The negative minimumDelay
!DiffTime
| -- | The configured 'sfaRateLimit' is non-positive, which disables snapshot
-- rate limiting entirely. Without a rate limit, snapshots may be taken
-- very frequently during bulk sync, causing excessive disk I/O.
--
-- Carries no payload; the offending value is not reported.
SnapshotRateLimitDisabled
| -- | The configured 'sfaRateLimit' exceeds 24 hours. At steady state, the
-- node may go more than a day between snapshots, significantly increasing
-- replay time after an unclean restart.
SnapshotRateLimitSuspiciouslyLarge
-- | The configured rate limit
!DiffTime
| -- | The configured number of on-disk snapshots to keep is zero. Snapshots
-- will be written to disk and then immediately deleted, leaving nothing
-- for crash recovery. The node will have to replay from genesis on every
-- unclean restart.
--
-- The 'displayException' text refers to the setting as @spaNum@.
SnapshotNumZero
| -- | The configured snapshot interval does not divide 432000 (the Cardano
-- mainnet epoch length in slots). Snapshots will not land on epoch
-- boundaries, breaking Mithril compatibility.
--
-- The 'displayException' text refers to the setting as @sfaInterval@.
--
-- NOTE(review): 432000 is stated here as the mainnet epoch length; a
-- network with a different epoch length would presumably need a different
-- divisor check -- confirm against the code that raises this issue.
SnapshotIntervalNotDivisorOfEpoch
-- | The configured interval in slots
!Word64
deriving (Show, Eq)

instance Exception SanityCheckIssue where
Expand All @@ -42,6 +78,61 @@ instance Exception SanityCheckIssue where
, "eras of a HardForkBlock: "
, show (NonEmpty.toList ks)
]
SnapshotDelayRangeInverted mn mx ->
mconcat
[ "SnapshotDelayRangeInverted: "
, "The configured snapshot delay range has minimumDelay ("
, show mn
, ") greater than maximumDelay ("
, show mx
, "). The random snapshot delay will be sampled from an inverted range. "
, "Please ensure minimumDelay <= maximumDelay in sfaDelaySnapshotRange."
]
SnapshotDelayRangeNegativeMinimum mn ->
mconcat
[ "SnapshotDelayRangeNegativeMinimum: "
, "The configured snapshot delay range has a negative minimumDelay: "
, show mn
, ". A negative delay has no meaningful interpretation. "
, "Please set minimumDelay to a non-negative value in sfaDelaySnapshotRange."
]
SnapshotRateLimitDisabled ->
mconcat
[ "SnapshotRateLimitDisabled: "
, "The configured sfaRateLimit is non-positive, which disables snapshot "
, "rate limiting entirely. Without a rate limit, snapshots may be taken "
, "very frequently during bulk sync, causing excessive disk I/O. "
, "The default rate limit is 10 minutes."
]
SnapshotRateLimitSuspiciouslyLarge rl ->
mconcat
[ "SnapshotRateLimitSuspiciouslyLarge: "
, "The configured sfaRateLimit ("
, show rl
, ") exceeds 24 hours. At steady state, the node may go more than a day "
, "between snapshots, significantly increasing replay time after an "
, "unclean restart. The default rate limit is 10 minutes."
]
SnapshotNumZero ->
mconcat
[ "SnapshotNumZero: "
, "The configured number of on-disk snapshots to keep (spaNum) is 0. "
, "Snapshots will be written to disk and immediately deleted, leaving "
, "nothing for crash recovery. The node will have to replay the chain "
, "from genesis on every unclean restart. "
, "Consider setting spaNum to at least 2 (the default)."
]
SnapshotIntervalNotDivisorOfEpoch interval ->
mconcat
[ "SnapshotIntervalNotDivisorOfEpoch: "
, "The configured sfaInterval ("
, show interval
, " slots) does not evenly divide the Cardano mainnet epoch length "
, "(432000 slots). Snapshots will not consistently land on epoch "
, "boundaries, which breaks Mithril compatibility. "
, "Consider using an interval that divides 432000 evenly, "
, "such as 4320 (the default, = 2k for k=2160)."
]

-- | 'BlockSupportsSanityCheck' provides evidence that a block can be sanity
-- checked for common issues on node startup. 'sanityCheckConfig', which runs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ openDBInternal args launchBgTasks = runWithTempRegistry $ do
-- want to track that one to close it on exception. See "Resource management
-- in the LedgerDB" in "Ouroboros.Consensus.Storage.LedgerDB.API" for an
-- explanation of why.
(lgrDB, replayed) <-
lgrDB <-
LedgerDB.openDB
argsLgrDb
(ImmutableDB.streamAPI immutableDB)
Expand Down Expand Up @@ -241,6 +241,7 @@ openDBInternal args launchBgTasks = runWithTempRegistry $ do
chainSelFuse <- newFuse "chain selection"
chainSelQueue <- newChainSelQueue (Args.cdbsBlocksToAddSize cdbSpecificArgs)
varChainSelStarvation <- newTVarIO ChainSelStarvationOngoing
varSnapshotDelayRNG <- newTVarIO (Args.cdbsSnapshotDelayRNG cdbSpecificArgs)

let env =
CDB
Expand All @@ -267,6 +268,7 @@ openDBInternal args launchBgTasks = runWithTempRegistry $ do
, cdbLoE = Args.cdbsLoE cdbSpecificArgs
, cdbChainSelStarvation = varChainSelStarvation
, cdbPerasCertDB = perasCertDB
, cdbSnapshotDelayRNG = varSnapshotDelayRNG
}

setGetCurrentChainForLedgerDB $ Query.getCurrentChain env
Expand Down Expand Up @@ -312,13 +314,7 @@ openDBInternal args launchBgTasks = runWithTempRegistry $ do
, intGarbageCollect = \slot -> getEnv h $ \e -> do
Background.garbageCollectBlocks e slot
LedgerDB.garbageCollect (cdbLedgerDB e) slot
, intTryTakeSnapshot = getEnv h $ \env' ->
void $
LedgerDB.tryTakeSnapshot
(cdbLedgerDB env')
(void $ Background.copyToImmutableDB env')
Nothing
maxBound
, intTryTakeSnapshot = getEnv2 h $ LedgerDB.tryTakeSnapshot . cdbLedgerDB
, intAddBlockRunner = getEnv h (Background.addBlockRunner addBlockTestFuse)
, intKillBgThreads = varKillBgThreads
}
Expand All @@ -329,7 +325,7 @@ openDBInternal args launchBgTasks = runWithTempRegistry $ do
(castPoint $ AF.anchorPoint chain)
(castPoint $ AF.headPoint chain)

when launchBgTasks $ Background.launchBgTasks env replayed
when launchBgTasks $ Background.launchBgTasks env

-- Note we put the ChainDB in the top level registry before exiting the
-- 'runWithTempRegistry' scope. This way, the critical resources (actually
Expand Down
Loading