From ee2fb5255b20a66366a4ff827b4b9a5336298bda Mon Sep 17 00:00:00 2001 From: jdalton Date: Mon, 12 Jan 2026 12:30:03 -0500 Subject: [PATCH] fix: prevent heap overflow in large monorepo scans Add streaming-based filtering to globWithGitIgnore to prevent heap overflow when scanning large monorepos with 100k+ files. Instead of accumulating all file paths and filtering afterwards, files are now filtered during streaming which dramatically reduces memory usage. Changes: - Add `filter` option to globWithGitIgnore for early filtering during streaming - Add createSupportedFilesFilter helper to create filter from supported files - Update getPackageFilesForScan to use streaming filter Fixes SMO-522 Ported from v1.x commit 9bbb8e83 ([SMO-522] Fix heap overflow in large monorepo scans #1026) Co-authored-by: Mikola Lysenko --- CHANGELOG.md | 5 +++ packages/cli/src/utils/fs/glob.mts | 45 +++++++++++++++++----- packages/cli/src/utils/fs/path-resolve.mts | 12 ++++-- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac196b1e2..c42d94a7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [Unreleased] + +### Fixed +- Prevent heap overflow in large monorepo scans by using streaming-based filtering to avoid accumulating all file paths in memory before filtering. 
+ ## [2.1.0](https://github.com/SocketDev/socket-cli/releases/tag/v2.1.0) - 2025-11-02 ### Added diff --git a/packages/cli/src/utils/fs/glob.mts b/packages/cli/src/utils/fs/glob.mts index efdd15139..cc619e3bc 100644 --- a/packages/cli/src/utils/fs/glob.mts +++ b/packages/cli/src/utils/fs/glob.mts @@ -165,6 +165,14 @@ export function filterBySupportedScanFiles( return filepaths.filter(p => micromatch.some(p, patterns, { dot: true })) } +export function createSupportedFilesFilter( + supportedFiles: SocketSdkSuccessResult<'getReportSupportedFiles'>['data'], +): (filepath: string) => boolean { + const patterns = getSupportedFilePatterns(supportedFiles) + return (filepath: string) => + micromatch.some(filepath, patterns, { dot: true }) +} + export function getSupportedFilePatterns( supportedFiles: SocketSdkSuccessResult<'getReportSupportedFiles'>['data'], ): string[] { @@ -179,6 +187,10 @@ export function getSupportedFilePatterns( } type GlobWithGitIgnoreOptions = GlobOptions & { + // Optional filter function to apply during streaming. + // When provided, only files passing this filter are accumulated. + // This is critical for memory efficiency when scanning large monorepos. + filter?: ((filepath: string) => boolean) | undefined socketConfig?: SocketYml | undefined } @@ -188,6 +200,7 @@ export async function globWithGitIgnore( ): Promise<string[]> { const { cwd = process.cwd(), + filter, socketConfig, ...additionalOptions } = { __proto__: null, ...options } as GlobWithGitIgnoreOptions @@ -244,27 +257,39 @@ export async function globWithGitIgnore( ...additionalOptions, } as GlobOptions - if (!hasNegatedPattern) { + // When no filter is provided and no negated patterns exist, use the fast path. + if (!hasNegatedPattern && !filter) { return await fastGlob.glob(patterns as string[], globOptions) } - // Add support for negated "ignore" patterns which many globbing libraries, // including 'fast-glob', 'globby', and 'tinyglobby', lack support for. 
- const filtered: string[] = [] - const ig = ignore().add([...ignores]) + // Use streaming to avoid unbounded memory accumulation. + // This is critical for large monorepos with 100k+ files. + const results: string[] = [] + const ig = hasNegatedPattern ? ignore().add([...ignores]) : null const stream = fastGlob.globStream( patterns as string[], globOptions, ) as AsyncIterable<string> for await (const p of stream) { - // Note: the input files must be INSIDE the cwd. If you get strange looking - // relative path errors here, most likely your path is outside the given cwd. - const relPath = globOptions.absolute ? path.relative(cwd, p) : p - if (!ig.ignores(relPath)) { - filtered.push(p) + // Check gitignore patterns with negation support. + if (ig) { + // Note: the input files must be INSIDE the cwd. If you get strange looking + // relative path errors here, most likely your path is outside the given cwd. + const relPath = globOptions.absolute ? path.relative(cwd, p) : p + if (ig.ignores(relPath)) { + continue + } + } + // Apply the optional filter to reduce memory usage. + // When scanning large monorepos, this filters early (e.g., to manifest files only) + // instead of accumulating all 100k+ files and filtering later. 
+ if (filter && !filter(p)) { + continue } + results.push(p) } - return filtered + return results } export async function globWorkspace( diff --git a/packages/cli/src/utils/fs/path-resolve.mts b/packages/cli/src/utils/fs/path-resolve.mts index 1d73b296a..d3c5d3660 100644 --- a/packages/cli/src/utils/fs/path-resolve.mts +++ b/packages/cli/src/utils/fs/path-resolve.mts @@ -7,7 +7,7 @@ import { WIN32 } from '@socketsecurity/lib/constants/platform' import { isDirSync } from '@socketsecurity/lib/fs' import { - filterBySupportedScanFiles, + createSupportedFilesFilter, globWithGitIgnore, pathsToGlobPatterns, } from './glob.mts' @@ -127,13 +127,17 @@ export async function getPackageFilesForScan( ...options, } as PackageFilesForScanOptions - const filepaths = await globWithGitIgnore( + // Apply the supported files filter during streaming to avoid accumulating + // all files in memory. This is critical for large monorepos with 100k+ files + // where accumulating all paths before filtering causes OOM errors. + const filter = createSupportedFilesFilter(supportedFiles) + + return await globWithGitIgnore( pathsToGlobPatterns(inputPaths, options?.cwd), { cwd, + filter, socketConfig, }, ) - - return filterBySupportedScanFiles(filepaths!, supportedFiles) }