fix(gatsby): Chunk nodes when serializing redux to prevent OOM
We are using `v8.serialize` to write and read the redux state, which is faster than going through `JSON.stringify`/`JSON.parse`. Unfortunately, as reported in #17233, this can lead to a fatal error when the redux state is too big to be serialized into a single Buffer (which has a hard max of 2GB). We also hit this problem on large sites, for example one with a million small md pages.

The solution is to shard the `nodes` property, which holds all the page data. This change adds a simple heuristic to determine the max chunk size (before this change it was effectively `Infinity`). It serializes about 11 individual nodes, measures their sizes, and based on the biggest node determines how many nodes would fit in 1.5GB.

The serialization process is updated to no longer put the `nodes` in the main redux file, but rather to shard them over a few dedicated files. When reading the state from cache, these files are all read and their contents are combined into a single Map again. If there are no node files, this step does nothing, which also makes it backwards compatible.

Because the write is no longer atomic, the process now writes the redux cache to its own `redux` folder. When writing a new cache it first prepares the new cache in a tmp folder, then moves the existing `redux` folder to a temp location, moves the new folder to `redux`, and finally tries to drop the old folder. This is about as transactional as you can get and should leave the cache in either a stale, empty, or updated state, but never in a partial state.
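
In sketch form, the chunked write/read looks roughly like this (a simplified illustration of the approach, not the literal implementation from the diff below; the chunk file names mirror the real ones, while the helper names are made up):

```js
const v8 = require(`v8`)
const path = require(`path`)
const { readdirSync, readFileSync, writeFileSync } = require(`fs-extra`)

// Write the nodes Map as a series of chunk files so that no single
// serialized Buffer comes near the ~2GB Buffer ceiling.
function writeNodeChunks(dir, nodes, chunkSize) {
  const entries = [...nodes.entries()]
  for (let i = 0; i * chunkSize < entries.length; ++i) {
    writeFileSync(
      path.join(dir, `redux.node.state_` + i),
      v8.serialize(entries.slice(i * chunkSize, (i + 1) * chunkSize))
    )
  }
}

// Read every chunk file back and reassemble the entries into one Map.
// The read order is irrelevant because each entry is a [key, value] pair.
function readNodeChunks(dir) {
  const chunks = readdirSync(dir)
    .filter(file => file.startsWith(`redux.node.state_`))
    .map(file => v8.deserialize(readFileSync(path.join(dir, file))))
  return new Map([].concat(...chunks))
}
```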
pvdz committed Feb 20, 2020
1 parent 19d7cac commit 938eca5
Showing 3 changed files with 206 additions and 11 deletions.
47 changes: 44 additions & 3 deletions packages/gatsby/src/redux/__tests__/index.js
@@ -1,4 +1,5 @@
const _ = require(`lodash`)
const path = require(`path`)

const writeToCache = jest.spyOn(require(`../persist`), `writeToCache`)
const { saveState, store, readState } = require(`../index`)
@@ -8,12 +9,40 @@ const {
} = require(`../actions`)

const mockWrittenContent = new Map()
const mockCompatiblePath = path
jest.mock(`fs-extra`, () => {
return {
writeFileSync: jest.fn((file, content) =>
mockWrittenContent.set(file, content)
),
readFileSync: jest.fn(file => mockWrittenContent.get(file)),
renameSync: jest.fn((from, to) => {
// This will only work for folders if they are always the full prefix
// of the file... (that goes for both input dirs). That's the case here.
if (mockWrittenContent.has(to)) {
throw new Error(`File/folder exists`)
}

// Move all files in this folder as well ... :/
mockWrittenContent.forEach((value, key) => {
if (key.startsWith(from)) {
// rename('foo/bar', 'a/b/c') => foo/bar/ding.js -> a/b/c/ding.js
// (.replace with string arg will only replace the first occurrence)
mockWrittenContent.set(
key.replace(from, to),
mockWrittenContent.get(key)
)
mockWrittenContent.delete(key)
}
})
}),
existsSync: jest.fn(target => mockWrittenContent.has(target)),
mkdtempSync: jest.fn(suffix => {
let dir = mockCompatiblePath.join(`some`, `tmp` + suffix + Math.random())
mockWrittenContent.set(dir, Buffer.from(`empty dir`))
return dir
}),
removeSync: jest.fn(file => mockWrittenContent.delete(file)),
}
})

@@ -41,11 +70,9 @@ describe(`redux db`, () => {
mockWrittenContent.clear()
})

- it(`expect components state to be empty initially`, () => {
+ it(`should write cache to disk`, async () => {
expect(initialComponentsState).toEqual(new Map())
- })

- it(`should write cache to disk`, async () => {
await saveState()

expect(writeToCache).toBeCalled()
@@ -66,4 +93,18 @@
// yuck - loki and redux will have different shape of redux state (nodes and nodesByType)
expect(_.omit(data, [`nodes`, `nodesByType`])).toMatchSnapshot()
})

it(`should drop legacy file if exists`, async () => {
expect(initialComponentsState).toEqual(new Map())

const legacyLocation = path.join(process.cwd(), `.cache/redux.state`)
mockWrittenContent.set(
legacyLocation,
Buffer.from(`legacy location for cache`)
)

await saveState()

expect(mockWrittenContent.has(legacyLocation)).toBe(false)
})
})
168 changes: 161 additions & 7 deletions packages/gatsby/src/redux/persist.ts
@@ -1,12 +1,166 @@
+ import path from "path"
import v8 from "v8"
- import { readFileSync, writeFileSync } from "fs-extra"
- import { ICachedReduxState } from "./types"
+ import {
+   existsSync,
+   mkdtempSync,
+   readFileSync,
+   removeSync,
+   renameSync,
+   writeFileSync,
+ } from "fs-extra"
+ import { IReduxNode, ICachedReduxState } from "./types"
+ import { sync as globSync } from "glob"

- const file = (): string => `${process.cwd()}/.cache/redux.state`
+ const getLegacyCacheFile = (): string =>
+   // TODO: remove this legacy stuff in v3 (fairly benign change but still)
+   // This is a function for the case that somebody does a process.chdir (#19800)
+   path.join(process.cwd(), `.cache/redux.state`)

- export const readFromCache = (): ICachedReduxState =>
-   v8.deserialize(readFileSync(file()))
+ const getReduxCacheFolder = (): string =>
+   // This is a function for the case that somebody does a process.chdir (#19800)
+   path.join(process.cwd(), `.cache/redux`)

- export const writeToCache = (contents: ICachedReduxState): void => {
-   writeFileSync(file(), v8.serialize(contents))
+ function reduxRestFile(dir: string): string {
+   return path.join(dir, `redux.rest.state`)
}
+ function reduxChunkFilePrefix(dir: string): string {
+   return path.join(dir, `redux.node.state_`)
+ }

function readFromLegacyCache(): ICachedReduxState {
return v8.deserialize(readFileSync(getLegacyCacheFile()))
}

export function readFromCache(): ICachedReduxState {
// The cache is stored in two steps; the nodes in chunks and the rest
// First we revive the rest, then we inject the nodes into that obj (if any)
// Each chunk is stored in its own file, this circumvents max buffer lengths
// for sites with a _lot_ of content. Since all nodes go into a Map, the order
// of reading them is not relevant.

const reduxCacheFolder = getReduxCacheFolder()

if (!existsSync(reduxCacheFolder)) {
return readFromLegacyCache()
}

const obj: ICachedReduxState = v8.deserialize(
readFileSync(reduxRestFile(reduxCacheFolder))
)

// Note: at 1M pages, this will be 1M/chunkSize chunks (ie. 1m/10k=100)
const chunks = globSync(
reduxChunkFilePrefix(reduxCacheFolder) + `*`
).map(file => v8.deserialize(readFileSync(file)))

const nodes: [string, IReduxNode][] = [].concat(...chunks)

if (chunks.length) {
obj.nodes = new Map(nodes)
}

return obj
}

function guessSafeChunkSize(values: [string, IReduxNode][]): number {
// Pick a few random elements and measure their size then pick a chunk size
// ceiling based on the worst case. Each test takes time so there's trade-off.
// This attempts to prevent small sites with very large pages from OOMing.
// This heuristic could still fail if it randomly grabs the smallest nodes.
// TODO: test a few nodes per each type instead of from all nodes

const nodesToTest = 11 // Very arbitrary number
const valueCount = values.length
const step = Math.max(1, Math.floor(valueCount / nodesToTest)) // at least 1 to avoid an infinite loop when there are fewer than ~11 nodes
let maxSize = 0
for (let i = 0; i < valueCount; i += step) {
const size = v8.serialize(values[i]).length
maxSize = Math.max(size, maxSize)
}

// Max size of a Buffer is 2gb (yeah, we're assuming 64bit system)
// https://stackoverflow.com/questions/8974375/whats-the-maximum-size-of-a-node-js-buffer
// Use 1.5gb as the target ceiling, allowing for some margin of error
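// Worked example (illustrative): if the biggest sampled node serializes
// to 1MiB, this returns floor(1.5GiB / 1MiB) = 1536 nodes per chunk file.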
return Math.floor((1.5 * 1024 * 1024 * 1024) / maxSize)
}

function prepareCacheFolder(
targetDir: string,
contents: ICachedReduxState
): void {
// Temporarily save the nodes and remove them from the main redux store
// This prevents an OOM when the page nodes collectively contain too much data
const map = contents.nodes
contents.nodes = undefined
writeFileSync(reduxRestFile(targetDir), v8.serialize(contents))
// Now restore them on the redux store
contents.nodes = map

if (map) {
// Now store the nodes separately, chunk size determined by a heuristic
const values: [string, IReduxNode][] = [...map.entries()]
const chunkSize = guessSafeChunkSize(values)
const chunks = Math.ceil(values.length / chunkSize)

for (let i = 0; i < chunks; ++i) {
writeFileSync(
reduxChunkFilePrefix(targetDir) + i,
v8.serialize(values.slice(i * chunkSize, i * chunkSize + chunkSize))
)
}
}
}

function safelyRenameToBak(reduxCacheFolder: string): string {
// Basically try to work around the potential of previous renamed caches
// not being removed for whatever reason. _That_ should not be a blocker.
const tmpSuffix = `.bak`
let suffixCounter = 0
let bakName = reduxCacheFolder + tmpSuffix // Start without number
while (existsSync(bakName)) {
++suffixCounter
bakName = reduxCacheFolder + tmpSuffix + suffixCounter
}
renameSync(reduxCacheFolder, bakName)

return bakName
}

export function writeToCache(contents: ICachedReduxState): void {
// Note: this should be a transactional operation. So work in a tmp dir and
// make sure the cache cannot be left in a corrupt state due to errors.

const tmpDir = mkdtempSync(`reduxcache`) // linux / windows

prepareCacheFolder(tmpDir, contents)

// Replace old cache folder with new. If the first rename fails, the cache
// is just stale. If the second rename fails, the cache is empty. In either
// case the cache is not left in a corrupt state.

const reduxCacheFolder = getReduxCacheFolder()

let bakName = ``
if (existsSync(reduxCacheFolder)) {
// Don't drop until after swapping over (renaming is less likely to fail)
bakName = safelyRenameToBak(reduxCacheFolder)
}

// The redux cache folder should now not exist so we can rename our tmp to it
renameSync(tmpDir, reduxCacheFolder)

// Now try to yolorimraf the old cache folder
try {
const legacy = getLegacyCacheFile()
if (existsSync(legacy)) {
removeSync(legacy)
}
if (bakName !== ``) {
removeSync(bakName)
}
} catch (e) {
console.warn(
`Non-fatal: Deleting the old cache folder failed, left behind in \`${bakName}\`. Rimraf reported this error: ${e}`
)
}
}
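
For context, a round-trip through this module might look like the following (hypothetical usage; in Gatsby the real callers are `saveState` and `readState` in `redux/index`, and a real `ICachedReduxState` carries more fields than shown here):

```js
const { writeToCache, readFromCache } = require(`./persist`)

// Hypothetical round-trip: the nodes Map is chunked into `.cache/redux`
// on write and reassembled into a single Map on read.
const state = {
  nodes: new Map([[`node-1`, { id: `node-1`, internal: { type: `MarkdownRemark` } }]]),
}
writeToCache(state)

const revived = readFromCache()
console.log(revived.nodes.get(`node-1`).internal.type) // MarkdownRemark
```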
2 changes: 1 addition & 1 deletion packages/gatsby/src/redux/types.ts
@@ -3,7 +3,7 @@ export enum ProgramStatus {
BOOTSTRAP_QUERY_RUNNING_FINISHED = `BOOTSTRAP_QUERY_RUNNING_FINISHED`,
}

- export type IReduxNode = {
+ export interface IReduxNode {
id: string
internal: {
type: string
