dockerTools.streamLayeredImage: Improve layer sharing for large images (#355189)

Philip Taron
2025-09-07 12:37:35 +02:00
committed by GitHub
6 changed files with 282 additions and 52 deletions

View File

@@ -147,6 +147,8 @@
- Added `rewriteURL` attribute to the nixpkgs `config`, to allow for rewriting the URLs downloaded by `fetchurl`.
- The `dockerTools.streamLayeredImage` builder now uses a better algorithm for generating layered Docker images, so that much more sharing is possible when the number of store paths exceeds the layer limit. It gives each of the largest store paths its own layer and adds their dependencies to those layers when they aren't used elsewhere.
- The systemd initrd will now respect `x-systemd.wants` and `x-systemd.requires` for reliably unlocking multi-disk bcachefs volumes.
- [`homebox` 0.20.0](https://github.com/sysadminsmedia/homebox/releases/tag/v0.20.0) changed how assets are stored and hashed. It is recommended to back up your database before this update.
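
To make the sharing claim in the `streamLayeredImage` entry concrete, here is a hedged back-of-the-envelope sketch (all sizes invented, not from the PR): two images that share a large runtime but differ in a small application only stay shareable if the runtime lands in its own layer.

runtime_mb, app_mb = 200, 5
# One layer per large store path: pulling the second image re-fetches only the app layer.
reused_with_per_path_layers = runtime_mb  # 200 MB reused
# Runtime merged into each image's layers: no layer is content-identical across images.
reused_with_merged_layers = 0
print(reused_with_per_path_layers, reused_with_merged_layers)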

View File

@@ -0,0 +1,52 @@
{
  jq,
  lib,
  python3,
  runCommand,
  writeText,
}:
{
  closureRoots,
  excludePaths ? [ ],
  maxLayers ? 100,
  fromImage ? null,
  debug ? false,
}:
runCommand "layers.json"
  {
    __structuredAttrs = true;
    exportReferencesGraph.graph = closureRoots;
    inherit fromImage maxLayers;
    nativeBuildInputs = [
      jq
      python3
    ];
    excludePathsFile = writeText "excludePaths" (lib.concatMapStrings (x: x + "\n") excludePaths);
  }
  ''
    # Compute the number of layers that are already used by a potential
    # 'fromImage' as well as the customisation layer. Ensure that there is
    # still at least one layer available to store the image contents.
    # One layer is always taken up by the customisation layer.
    usedLayers=1
    if [ -n "$fromImage" ]; then
      # Account for the layers already used by the base image.
      baseImageLayersCount=$(tar -xOf "$fromImage" manifest.json | jq '.[0].Layers | length')
      (( usedLayers += baseImageLayersCount ))
    fi
    if ! (( $usedLayers < $maxLayers )); then
      echo >&2 "Error: $usedLayers layers are needed to store 'fromImage' and" \
        "'extraCommands', but only maxLayers=$maxLayers were allowed." \
        "At least 1 layer is required to store the image contents."
      exit 1
    fi
    availableLayers=$(( maxLayers - usedLayers ))

    jq .graph "$NIX_ATTRS_JSON_FILE" > referencesGraph
    ${lib.optionalString debug "export DEBUG=1"}
    python3 ${./auto-layer.py} referencesGraph "$excludePathsFile" "$availableLayers" > "$out"
  ''
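
The layer budget above is simple arithmetic; here is a hedged Python sketch of the same computation (the concrete numbers are invented, not from the PR):

max_layers = 100               # the maxLayers argument
base_image_layers = 7          # hypothetical layer count of a fromImage
used = 1 + base_image_layers   # the customisation layer plus the base image layers
assert used < max_layers, "at least one layer is needed for the image contents"
available = max_layers - used  # layers left for the store paths: 92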

View File

@@ -0,0 +1,200 @@
#!/usr/bin/env python3
# usage: auto-layer.py graph_file [ignore_file] [layer_limit]
#   graph_file: Path to a JSON file as generated by exportReferencesGraph
#   ignore_file: Path to a file listing store paths that should not appear in the output
#   layer_limit: Maximum number of layers to generate (default 100)
#
# This module tries to split a dependency graph of nix store paths into a
# limited set of layers that together cover all mentioned paths. It tries to
# choose the layers such that different inputs often have the largest layers
# in common, so most layers can be shared, while the differences between the
# results end up in smaller layers.
#
# It does this by splitting off the N largest store paths (by nar size) into
# their own layers, including some of their dependencies. Specifically, for a
# large store path L, it creates a layer with L and any store path D that L
# depends on and for which there is no store path in the input that depends
# on D but not on L. Then, if there are any store paths that are depended on
# by multiple of the chosen large store paths, those common dependencies get
# their own layer, one per set of large store paths that depends on them.
# N is iteratively increased until the layer limit is reached.
#
# The reasoning for this algorithm is as follows: most closures contain a few
# large store paths and many small store paths. If we want to share as many
# bytes as possible with other layered images, we should focus on putting the
# largest paths in their own layers. If we had data on how much each store
# path is used and how likely each combination of store paths is, we might be
# able to infer which large store paths are better off being combined into a
# single layer. However, getting that information, let alone keeping it up to
# date, is very difficult. If we can't tell that two large store paths are
# often going to appear together, then we're better off giving each of them
# its own layer. This leaves a lot of smaller store paths to be assigned to
# layers. Anything that depends on a large store path L will also depend on
# all the store paths that L depends on, so it makes sense to move the
# dependencies of L into the same layer as L.
#
# Possible improvements:
# - Specifying a size limit below which the algorithm stops using large store
#   paths as new layer roots might further improve sharing, as the layer
#   boundaries will depend less on the number of larger store paths in the
#   input.

import json
import os
import sys
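
# A tiny worked example (hypothetical store paths and sizes, not part of the
# PR): take a closure app -> {python, lib}, python -> glibc, lib -> glibc,
# with nar sizes app > python > lib > glibc and a layer limit of 3. The first
# iteration puts everything into app's layer. The second splits off python
# together with glibc: once app is discounted as a user of python, no primary
# path outside python's closure still claims glibc. Making lib primary as
# well would need 4 layers, so the loop stops, leaving two layers:
# [python, glibc] (shareable runtime) and [app, lib] (image-specific).

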
def layer_count(layer_split):
    return len(set(layer_split.values()))


def path_key(path):
    hash, name = path.split('-', 1)
    return name, hash


def closure(*todo, key):
    """
    Find all dependencies of the arguments including the arguments themselves.
    """
    todo = set(todo)
    done = set()
    while todo:
        x = todo.pop()
        if x not in done:
            done.add(x)
            todo.update(key(x))
    return done


def dependencies(*todo, key):
    """
    Find all dependencies of the arguments excluding the arguments themselves.
    """
    return closure(*todo, key=key) - set(todo)
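
# Illustrative example (not part of the PR): with
# deps = {"a": {"b"}, "b": {"c"}, "c": set()},
# closure("a", key=deps.get) == {"a", "b", "c"}, while
# dependencies("a", key=deps.get) == {"b", "c"}.

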
def minimal_cover(paths, key):
    """
    The minimal set of paths that together cover all input paths with their
    closure. None of the result paths depend on each other.
    """
    paths = set(paths)
    paths_deps = set.union(*(dependencies(d, key=key) for d in paths))
    return paths - paths_deps


def auto_layer(graph, ignore_paths, layer_limit):
    # Compute all direct users of each path.
    nodes = {x["path"]: x | {"users": set()} for x in graph}
    for user in nodes:
        for ref in nodes[user]["references"]:
            nodes[ref]["users"] |= {user}

    def node_deps(path):
        nonlocal nodes
        return nodes[path]["references"]

    def node_users(path):
        nonlocal nodes
        return nodes[path]["users"]

    nodes_by_size = sorted(graph, key=lambda node: node["narSize"])

    # Here starts the main algorithm:
    # The goal is to split the set of store paths into layers such that the
    # layers are likely to be reusable and the closure size is spread out
    # over the layers. We do this by iteratively taking the largest store
    # path and giving it its own layer. This primary store path becomes the
    # identity of the layer. We also add every dependency of the identifying
    # store path to the same layer, unless it is also used by something that
    # doesn't depend on the identifying store path. More generally, we put
    # store paths together in the same layer when the set of other layers
    # that depend on them is the same.
    #
    # layer_split defines how the layers are currently split. We start with
    # a single layer with no dependencies. This is encoded as every store
    # path mapped to the empty set of dependencies. In general, layer_split
    # maps each store path to the set of primary paths that depend on it,
    # and that set defines and identifies the layer.
    layer_split = {path: frozenset() for path in nodes}
    primary_paths = set()
    while nodes_by_size:
        # Every iteration, we choose the next biggest path to be the root of
        # a new layer.
        new_primary_path = nodes_by_size.pop()["path"]
        primary_paths.add(new_primary_path)
        new_layer_split = layer_split.copy()
        new_layer_split[new_primary_path] = frozenset({new_primary_path})
        new_primary_path_deps = dependencies(new_primary_path, key=node_deps)
        new_primary_path_users = dependencies(new_primary_path, key=node_users)
        # Update the set of primary users for every dependency of the new
        # primary path: drop the primaries that transitively use the new
        # primary path, then claim the dependency for the new primary path
        # unless it is already claimed by a primary inside the new primary
        # path's closure.
        for dep in new_primary_path_deps:
            new_layer_split[dep] -= new_primary_path_users
            if not new_layer_split[dep] & new_primary_path_deps:
                new_layer_split[dep] |= {new_primary_path}
        # If we exceed the layer limit, we give up. The previous split
        # should be good enough.
        if layer_count(new_layer_split) > layer_limit:
            break
        layer_split = new_layer_split
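
    # At this point, layer_split is the finest split that still fits within
    # layer_limit: every store path is grouped with the other paths claimed
    # by the same set of primary paths.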

    # Main algorithm done; the layers have been chosen.
    # Now, let's give each layer some metadata, mostly for debugging.
    def layer_info(layer_id):
        nonlocal nodes
        nonlocal layer_split
        # The full set of paths in this layer is all the paths that were
        # assigned to it.
        paths = {path
                 for path, layer_id_2 in layer_split.items()
                 if layer_id == layer_id_2}
        layerSize = sum(nodes[path]["narSize"] for path in paths)
        return {
            "usedBy": sorted(layer_id, key=path_key),
            "paths": sorted(paths, key=path_key),
            "layerSize": layerSize,
            "closureSize": sum(nodes[path]["narSize"] for path in closure(*paths, key=node_deps)),
        }

    layers = {layer_id: layer_info(layer_id)
              for layer_id in set(layer_split.values())}

    # The layer order doesn't actually matter to Docker, but it's still neat
    # to have layers come after all of their dependencies. The easiest way
    # to do that is to order by closure size, since a layer's closure is
    # necessarily larger than that of each of its dependencies (it includes
    # them).
    layer_order = sorted(layers.values(), key=lambda info: info["closureSize"])
    if os.environ.get("DEBUG"):
        print(json.dumps(layer_order, indent=2), file=sys.stderr)

    # Sanity check that no store path ends up in multiple layers.
    total_layer_size = sum(layer["layerSize"] for layer in layer_order)
    total_nar_size = sum(node["narSize"] for node in graph)
    assert total_layer_size == total_nar_size, (total_layer_size, total_nar_size)

    # Format the result as a list of layers, each defined as a list of store
    # paths. Layers consisting entirely of ignored paths are dropped.
    return [[path
             for path in layer["paths"]
             if path not in ignore_paths]
            for layer in layer_order
            if set(layer["paths"]) - ignore_paths]


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        prog='auto-layer',
        description='Split store paths into docker layers.'
    )
    parser.add_argument('graph_file')
    # nargs='?' makes these positionals optional, matching the usage comment
    # at the top of the file; a bare positional ignores its default otherwise.
    parser.add_argument('ignore_file', nargs='?', default="/dev/null")
    parser.add_argument('layer_limit', nargs='?', type=int, default=100)
    args = parser.parse_args()

    with open(args.graph_file) as f:
        graph = json.load(f)
    with open(args.ignore_file) as f:
        ignore_paths = {line.strip() for line in f if line.strip()}

    print(json.dumps(auto_layer(graph, ignore_paths, args.layer_limit)))
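
For orientation, a hedged smoke test of auto_layer (the graph literal mirrors the shape that exportReferencesGraph emits — path, references, narSize — but the store paths, sizes, and script location are invented assumptions):

import runpy

# Load auto-layer.py despite the hyphen in its name; the __main__ guard keeps
# the CLI from running.
mod = runpy.run_path("auto-layer.py")

graph = [
    {"path": "/nix/store/aaaa-glibc", "references": [], "narSize": 30},
    {"path": "/nix/store/bbbb-python3", "references": ["/nix/store/aaaa-glibc"], "narSize": 80},
    {"path": "/nix/store/cccc-app", "references": ["/nix/store/bbbb-python3"], "narSize": 100},
]
# With the limit at 3, every path becomes primary and gets its own layer,
# ordered by closure size: [[...glibc], [...python3], [...app]].
print(mod["auto_layer"](graph, ignore_paths=set(), layer_limit=3))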

View File

@@ -1113,24 +1113,32 @@ rec {
'';
};
  layersJsonFile = buildPackages.dockerMakeLayers {
    inherit debug;
    closureRoots = optionals includeStorePaths [
      baseJson
      customisationLayer
    ];
    excludePaths = [
      baseJson
      customisationLayer
    ];
    pipeline =
      if layeringPipeline != null then
        layeringPipeline
      else
        import ./popularity-contest-layering-pipeline.nix { inherit lib jq runCommand; } {
          inherit fromImage maxLayers;
        };
  };

  closureRoots = optionals includeStorePaths [
    baseJson
    customisationLayer
  ];
  excludePaths = [
    baseJson
    customisationLayer
  ];
  layersJsonFile =
    if layeringPipeline == null then
      buildPackages.dockerAutoLayer {
        inherit
          closureRoots
          debug
          excludePaths
          fromImage
          maxLayers
          ;
      }
    else
      buildPackages.dockerMakeLayers {
        inherit closureRoots debug excludePaths;
        pipeline = layeringPipeline;
      };
conf =
runCommand "${baseName}-conf.json"
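
The hunk above boils down to a dispatch on layeringPipeline; a hedged Python rendering of that selection logic (function and parameter names chosen for illustration only):

def layers_json_file(layering_pipeline, docker_auto_layer, docker_make_layers):
    # The new auto-layer builder is the default; an explicit pipeline
    # overrides it and is forwarded to dockerMakeLayers unchanged.
    if layering_pipeline is None:
        return docker_auto_layer()
    return docker_make_layers(pipeline=layering_pipeline)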

View File

@@ -1,34 +0,0 @@
{
  lib,
  runCommand,
  jq,
}:
{
  maxLayers,
  fromImage ? null,
}:
runCommand "popularity-contest-layering-pipeline.json" { inherit maxLayers; } ''
  # Compute the number of layers that are already used by a potential
  # 'fromImage' as well as the customisation layer. Ensure that there is
  # still at least one layer available to store the image contents.
  # One layer is always taken up by the customisation layer.
  usedLayers=1
  ${lib.optionalString (fromImage != null) ''
    # Account for the layers already used by the base image.
    baseImageLayersCount=$(tar -xOf "${fromImage}" manifest.json | ${lib.getExe jq} '.[0].Layers | length')
    (( usedLayers += baseImageLayersCount ))
  ''}
  if ! (( $usedLayers < $maxLayers )); then
    echo >&2 "Error: $usedLayers layers are needed to store 'fromImage' and" \
      "'extraCommands', but only maxLayers=$maxLayers were allowed." \
      "At least 1 layer is required to store the image contents."
    exit 1
  fi
  availableLayers=$(( maxLayers - usedLayers ))

  # Produce a pipeline which uses the popularity_contest algorithm.
  echo '[["popularity_contest"],["limit_layers",'$availableLayers']]' > "$out"
''
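
For reference, the pipeline file this removed builder produced is plain JSON: a list of steps, each a list whose head is the step name and whose tail holds its arguments. A hedged sketch of reading one (the 99 is an invented example value):

import json

spec = json.loads('[["popularity_contest"], ["limit_layers", 99]]')
for step, *args in spec:
    print(step, args)  # popularity_contest [] / limit_layers [99]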

View File

@@ -859,6 +859,8 @@ with pkgs;
referencesByPopularity = callPackage ../build-support/references-by-popularity { };
dockerAutoLayer = callPackage ../build-support/docker/auto-layer.nix { };
dockerMakeLayers = callPackage ../build-support/docker/make-layers.nix { };
removeReferencesTo = callPackage ../build-support/remove-references-to {