diff --git a/CHANGELOG.md b/CHANGELOG.md index ca43800d..56dfc142 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fixed issue where certain file and folder names would cause type errors. [#862](https://github.com/sourcebot-dev/sourcebot/pull/862) - Fixed token refresh error "Provider config not found or invalid for: x" when a sso is configured using deprecated env vars. [#841](https://github.com/sourcebot-dev/sourcebot/pull/841) +- Fixed issue where temporary shard files created on index failure were not being cleaned up. [#805](https://github.com/sourcebot-dev/sourcebot/pull/805) ## [4.10.27] - 2026-02-05 diff --git a/packages/backend/src/repoIndexManager.ts b/packages/backend/src/repoIndexManager.ts index 6237a034..11fc0b94 100644 --- a/packages/backend/src/repoIndexManager.ts +++ b/packages/backend/src/repoIndexManager.ts @@ -1,20 +1,19 @@ import * as Sentry from '@sentry/node'; import { PrismaClient, Repo, RepoIndexingJobStatus, RepoIndexingJobType } from "@sourcebot/db"; -import { createLogger, Logger } from "@sourcebot/shared"; -import { env, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema, getRepoPath } from '@sourcebot/shared'; +import { createLogger, env, getRepoPath, Logger, RepoIndexingJobMetadata, repoIndexingJobMetadataSchema, RepoMetadata, repoMetadataSchema } from "@sourcebot/shared"; +import { DelayedError, Job, Queue, Worker } from "bullmq"; import { existsSync } from 'fs'; import { readdir, rm } from 'fs/promises'; -import { DelayedError, Job, Queue, Worker } from "bullmq"; import { Redis } from 'ioredis'; -import Redlock, { ExecutionError } from 'redlock'; import micromatch from 'micromatch'; -import { WORKER_STOP_GRACEFUL_TIMEOUT_MS, INDEX_CACHE_DIR } from './constants.js'; +import Redlock, { ExecutionError } from 'redlock'; +import { INDEX_CACHE_DIR, WORKER_STOP_GRACEFUL_TIMEOUT_MS } from 
'./constants.js'; import { cloneRepository, fetchRepository, getBranches, getCommitHashForRefName, getLatestCommitTimestamp, getLocalDefaultBranch, getTags, isPathAValidGitRepoRoot, unsetGitConfig, upsertGitConfig } from './git.js'; import { captureEvent } from './posthog.js'; import { PromClient } from './promClient.js'; import { RepoWithConnections, Settings } from "./types.js"; import { getAuthCredentialsForRepo, getShardPrefix, measure, setIntervalAsync } from './utils.js'; -import { indexGitRepository } from './zoekt.js'; +import { cleanupTempShards, indexGitRepository } from './zoekt.js'; const LOG_TAG = 'repo-index-manager'; const logger = createLogger(LOG_TAG); @@ -478,9 +477,17 @@ export class RepoIndexManager { } logger.info(`Indexing ${repo.name} (id: ${repo.id})...`); - const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal)); - const indexDuration_s = durationMs / 1000; - logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`); + try { + const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal)); + const indexDuration_s = durationMs / 1000; + logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`); + } catch (error) { + // Clean up any temporary shard files left behind by the failed indexing operation. + // Zoekt creates .tmp files during indexing which can accumulate if indexing fails repeatedly. 
+            logger.warn(`Indexing failed for ${repo.name} (id: ${repo.id}), cleaning up temp shard files...`);
+            await cleanupTempShards(repo);
+            throw error;
+        }
 
         return revisions;
     }
diff --git a/packages/backend/src/zoekt.ts b/packages/backend/src/zoekt.ts
index 27f17d71..ff97d1ce 100644
--- a/packages/backend/src/zoekt.ts
+++ b/packages/backend/src/zoekt.ts
@@ -1,6 +1,7 @@
 import { Repo } from "@sourcebot/db";
 import { createLogger, env, getRepoPath } from "@sourcebot/shared";
 import { exec } from "child_process";
+import { readdir, rm } from "fs/promises";
 import { INDEX_CACHE_DIR } from "./constants.js";
 import { Settings } from "./types.js";
 import { getShardPrefix } from "./utils.js";
@@ -54,3 +55,52 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
         })
     });
 }
+
+/**
+ * Cleans up temporary shard files left behind by a failed indexing operation.
+ * Zoekt creates temporary files (with `.tmp` suffix) during indexing, which
+ * can be left behind if the indexing process fails or is interrupted.
+ *
+ * Cleanup is best effort: errors from listing the cache directory or from
+ * deleting a file are logged rather than thrown. Each file is removed
+ * independently, so one failed deletion does not stop the remaining ones.
+ *
+ * @param repo - The repository whose temp shards should be cleaned up
+ * @returns A promise that resolves once cleanup has been attempted
+ */
+export const cleanupTempShards = async (repo: Repo): Promise<void> => {
+    const shardPrefix = getShardPrefix(repo.orgId, repo.id);
+
+    let tempFiles: string[];
+    try {
+        const files = await readdir(INDEX_CACHE_DIR);
+        // NOTE(review): a bare prefix match could also select another repo's
+        // temp shards if that repo's prefix merely extends this one (e.g. ids
+        // 1 vs 10) - confirm getShardPrefix output is delimiter-terminated.
+        tempFiles = files.filter(file =>
+            file.startsWith(shardPrefix) && file.includes('.tmp')
+        );
+    } catch (error) {
+        // Log but don't throw - cleanup is best effort
+        logger.warn(`Failed to cleanup temp shards for repo ${repo.id}:`, error);
+        return;
+    }
+
+    let removedCount = 0;
+    for (const file of tempFiles) {
+        const filePath = `${INDEX_CACHE_DIR}/${file}`;
+        logger.info(`Cleaning up temp shard file: ${filePath}`);
+        try {
+            await rm(filePath, { force: true });
+            removedCount++;
+        } catch (error) {
+            // Keep going: one undeletable file must not strand the others,
+            // otherwise temp shards keep accumulating across failed runs.
+            logger.warn(`Failed to remove temp shard file ${filePath}:`, error);
+        }
+    }
+
+    if (removedCount > 0) {
+        logger.info(`Cleaned up ${removedCount} temp shard file(s) for repo ${repo.id}`);
+    }
+}