diff --git a/Changelog.md b/Changelog.md index 8a17f258e..f9f459089 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,5 +1,9 @@ # FOSSA CLI Changelog +## 3.17.6 + +- Config: `paths.only` and `paths.exclude` in `.fossa.yml` now accept glob patterns. ([#1703](https://github.com/fossas/fossa-cli/pull/1703)) + ## 3.17.5 - Vendetta: Debug bundles now include per-file component match data from Vendetta scans, making it easier to diagnose why a vendored dependency was or wasn't detected. ([#1706](https://github.com/fossas/fossa-cli/pull/1706)) diff --git a/docs/references/files/fossa-yml.md b/docs/references/files/fossa-yml.md index b993bc1b4..1911773ef 100644 --- a/docs/references/files/fossa-yml.md +++ b/docs/references/files/fossa-yml.md @@ -334,6 +334,28 @@ The list of paths to exclude from scanning in your directory. This section is intended to be used as the inverse to `paths.only`. If you have a certain directory such as `development` you wish to exclude, `paths.exclude` enables you to do this. +#### Glob patterns + +Entries in `paths.only` and `paths.exclude` may also be glob patterns. An entry is treated as a glob if it contains `*`; other entries keep their existing semantics (match the directory and all of its children). Glob matching follows [`System.FilePattern`][filepattern] semantics: `*` matches any sequence of characters within a single path segment, and `**` matches any number of segments. + +Patterns use forward slashes (`/`) as path separators; backslashes are normalized so Windows-native patterns also work. + +```yaml +paths: + exclude: + - "**/vendor/**" + - "**/node_modules/**" + - "build/generated/*" +``` + +Each example above excludes a different shape of directory: + +- `**/vendor/**` skips Go-style vendored trees at any depth, e.g. `services/billing/vendor/k8s.io/apimachinery/pkg/apis/meta/v1/`. +- `**/node_modules/**` skips installed npm packages wherever they appear, e.g. 
`apps/web-frontend/node_modules/@babel/preset-env/lib/plugins/syntax-dynamic-import/`. +- `build/generated/*` is anchored at the repo root and matches *direct* children of `build/generated/` only. `build/generated/proto-go/` matches; the walker then prunes its entire subtree (e.g. `build/generated/proto-go/v1/messagepb/`). + +[filepattern]: https://hackage.haskell.org/package/filepattern + ### Analysis target configuration Analysis target configuration allows you to select a very specific subset of your directory for scanning. The `targets` and `paths` sections allow users to configure which targets and directories should be scanned. This is useful if you have a custom test directory or development projects within the root project. diff --git a/docs/references/files/fossa-yml.v3.schema.json b/docs/references/files/fossa-yml.v3.schema.json index 5618d8f36..46e2b4f90 100644 --- a/docs/references/files/fossa-yml.v3.schema.json +++ b/docs/references/files/fossa-yml.v3.schema.json @@ -528,18 +528,18 @@ }, "paths": { "type": "object", - "description": "The paths filtering section allows you to specify which paths should be scanned and which should not. The paths should be listed as their location from the root of your project.", + "description": "The paths filtering section allows you to specify which paths should be scanned and which should not. Entries may be concrete directory paths (which match the directory and all of its children) or glob patterns. An entry is treated as a glob if it contains `*`. 
Glob syntax follows System.FilePattern: `*` matches any sequence of characters within a single path segment, and `**` matches any number of segments.", "properties": { "only": { "type": "array", - "description": "The list of paths to only allow scanning within.", + "description": "The list of paths or glob patterns to only allow scanning within.", "items": { "type": "string" } }, "exclude": { "type": "array", - "description": "The list of paths to exclude from scanning in your directory.", + "description": "The list of paths or glob patterns to exclude from scanning in your directory.", "items": { "type": "string" } diff --git a/src/App/Fossa/Analyze.hs b/src/App/Fossa/Analyze.hs index f2d0700c8..3c5197364 100644 --- a/src/App/Fossa/Analyze.hs +++ b/src/App/Fossa/Analyze.hs @@ -121,7 +121,7 @@ import Data.Aeson qualified as Aeson import Data.ByteString.Lazy qualified as BL import Data.Error (createBody) import Data.Flag (Flag, fromFlag) -import Data.Foldable (traverse_) +import Data.Foldable (for_, traverse_) import Data.Functor (($>)) import Data.List.NonEmpty qualified as NE import Data.Map qualified as Map @@ -132,8 +132,9 @@ import Data.Traversable (for) import Diag.Diagnostic as DI import Diag.Result (resultToMaybe) import Discovery.Archive qualified as Archive -import Discovery.Filters (AllFilters, MavenScopeFilters, applyFilters, filterIsVSIOnly, ignoredPaths, isDefaultNonProductionPath) +import Discovery.Filters (AllFilters (..), MavenScopeFilters, applyFilters, combinedPathGlobs, combinedPaths, filterIsVSIOnly, ignoredPaths, isDefaultNonProductionPath) import Discovery.Projects (withDiscoveredProjects) +import Discovery.Walk (enumeratePrunedSubtrees) import Effect.Exec (Exec) import Effect.Logger ( Logger, @@ -297,6 +298,32 @@ runAnalyzers allowedTactics filters withoutDefaultFilters basedir pathPrefix = d where single (DiscoverFunc f) = withDiscoveredProjects f basedir (runDependencyAnalysis basedir filters withoutDefaultFilters pathPrefix allowedTactics) 
+-- | Walk the tree once at startup and surface every directory the path +-- filters will prune. Each prune is logged once at info level here, instead +-- of emitting per-strategy duplicates from inside the walker (~28 strategies +-- would otherwise each report the same prune). Short-circuits when no path +-- filters are configured so the extra walk is only paid for when it can +-- produce output. +logPrunedSubtrees :: + ( Has Logger sig m + , Has ReadFS sig m + , Has Diag.Diagnostics sig m + ) => + AllFilters -> + Path Abs Dir -> + m () +logPrunedSubtrees filters basedir = + unless (noPathFilters filters) $ do + pruned <- enumeratePrunedSubtrees filters basedir + for_ pruned $ \p -> + logInfo $ "Skipping path " <> viaShow p <> " (excluded by paths filter)" + where + noPathFilters AllFilters{includeFilters = i, excludeFilters = e} = + null (combinedPaths i) + && null (combinedPathGlobs i) + && null (combinedPaths e) + && null (combinedPathGlobs e) + analyze :: ( Has Debug sig m , Has Diag.Diagnostics sig m @@ -331,6 +358,12 @@ analyze cfg = Diag.context "fossa-analyze" $ do withoutDefaultFilters = Config.withoutDefaultFilters cfg enableSnippetScan = Config.snippetScan cfg enableVendetta = Config.xVendetta cfg + -- Discovery runs with `mempty` when `--no-discovery-exclusion` is set + -- (see definition further down). Log against the same filter set so the + -- startup output matches what discovery actually applies. + discoveryFilters = if fromFlag NoDiscoveryExclusion noDiscoveryExclusion then mempty else filters + + logPrunedSubtrees discoveryFilters basedir manualDepsResult <- Diag.errorBoundaryIO . diagToDebug $ @@ -434,7 +467,6 @@ analyze cfg = Diag.context "fossa-analyze" $ do pure Nothing else Diag.context "first-party-scans" . runStickyLogger SevInfo $ runFirstPartyScan basedir maybeApiOpts cfg let firstPartyScanResults = join . 
resultToMaybe $ maybeFirstPartyScanResults - let discoveryFilters = if fromFlag NoDiscoveryExclusion noDiscoveryExclusion then mempty else filters let strategyCfg = (Config.strategyConfig cfg) { Config.useGitBackedCargoLocators = Config.UseGitBackedCargoLocators $ maybe True orgSupportsGitBackedCargoLocators orgInfo diff --git a/src/App/Fossa/Config/Common.hs b/src/App/Fossa/Config/Common.hs index 75d83393e..7811bb5ff 100644 --- a/src/App/Fossa/Config/Common.hs +++ b/src/App/Fossa/Config/Common.hs @@ -120,7 +120,7 @@ import Data.String (IsString) import Data.String.Conversion (ToText (toText)) import Data.Text (Text, null, strip, toLower) import Diag.Result (Result (Failure, Success), renderFailure) -import Discovery.Filters (AllFilters (AllFilters), MavenScopeFilters (..), comboExclude, comboInclude, setExclude, setInclude, targetFilterParser) +import Discovery.Filters (AllFilters (AllFilters), MavenScopeFilters (..), comboExcludeWithGlobs, comboIncludeWithGlobs, partitionPathFilters, setExclude, setInclude, targetFilterParser) import Effect.Exec (Exec) import Effect.Logger (Logger, logDebug, logInfo, renderIt, vsep) import Effect.ReadFS (ReadFS, doesDirExist, doesFileExist) @@ -624,11 +624,13 @@ collectConfigFileFilters configFile = do let pullFromFile :: (a -> [b]) -> (ConfigFile -> Maybe a) -> [b] pullFromFile field section = maybe [] field (section configFile) onlyT = pullFromFile targetsOnly configTargets - onlyP = pullFromFile pathsOnly configPaths + (onlyP, onlyG) = partitionPathFilters $ pullFromFile pathsOnly configPaths excludeT = pullFromFile targetsExclude configTargets - excludeP = pullFromFile pathsExclude configPaths + (excludeP, excludeG) = partitionPathFilters $ pullFromFile pathsExclude configPaths - AllFilters (comboInclude onlyT onlyP) (comboExclude excludeT excludeP) + AllFilters + (comboIncludeWithGlobs onlyT onlyP onlyG) + (comboExcludeWithGlobs excludeT excludeP excludeG) collectConfigMavenScopeFilters :: ConfigFile -> MavenScopeFilters 
collectConfigMavenScopeFilters configFile = do diff --git a/src/App/Fossa/Config/ConfigFile.hs b/src/App/Fossa/Config/ConfigFile.hs index 98addf9d6..d6f690391 100644 --- a/src/App/Fossa/Config/ConfigFile.hs +++ b/src/App/Fossa/Config/ConfigFile.hs @@ -11,6 +11,7 @@ module App.Fossa.Config.ConfigFile ( ConfigRevision (..), ConfigTargets (..), ConfigPaths (..), + PathFilter (..), ConfigTelemetry (..), ConfigTelemetryScope (..), ExperimentalConfigs (..), @@ -57,6 +58,7 @@ import Data.Set qualified as Set import Data.String.Conversion (ToString (toString), ToText (toText)) import Data.Text (Text, strip, toLower) import Diag.Diagnostic (ToDiagnostic (..)) +import Discovery.Filters (PathFilter (..)) import Effect.Logger ( Logger, logDebug, @@ -257,8 +259,8 @@ data ConfigTargets = ConfigTargets deriving (Eq, Ord, Show) data ConfigPaths = ConfigPaths - { pathsOnly :: [Path Rel Dir] - , pathsExclude :: [Path Rel Dir] + { pathsOnly :: [PathFilter] + , pathsExclude :: [PathFilter] } deriving (Eq, Ord, Show) diff --git a/src/Discovery/Filters.hs b/src/Discovery/Filters.hs index 1e29b2488..b0ef8a967 100644 --- a/src/Discovery/Filters.hs +++ b/src/Discovery/Filters.hs @@ -12,7 +12,10 @@ module Discovery.Filters ( filterIsVSIOnly, comboInclude, comboExclude, + comboIncludeWithGlobs, + comboExcludeWithGlobs, combinedPaths, + combinedPathGlobs, combinedTargets, Include, Exclude, @@ -27,12 +30,16 @@ module Discovery.Filters ( setInclude, setExclude, mavenScopeFilterSet, + PathFilter (..), + partitionPathFilters, ) where import Control.Effect.Reader (Has, Reader, ask) import Control.Monad ((<=<)) -import Data.Aeson (ToJSON (toEncoding), defaultOptions, genericToEncoding) -import Data.List (isInfixOf, stripPrefix, (\\)) +import Data.Aeson (FromJSON, ToJSON (toEncoding), defaultOptions, genericToEncoding, pairs, parseJSON, withText, (.=)) +import Data.Glob (Glob, unGlob) +import Data.Glob qualified as Glob +import Data.List (isInfixOf, isPrefixOf, stripPrefix, (\\)) import 
Data.List.NonEmpty qualified as NE import Data.Map.Strict (Map) import Data.Map.Strict qualified as Map @@ -41,11 +48,12 @@ import Data.Semigroup (sconcat) import Data.Set (Set) import Data.Set qualified as Set import Data.Set.NonEmpty (nonEmpty, toSet) -import Data.String.Conversion (toText) +import Data.String.Conversion (toString, toText) import Data.Text (Text) import Data.Void (Void) import GHC.Generics (Generic) -import Path (Abs, Dir, Path, Rel, fromAbsDir, isProperPrefixOf, parseRelDir) +import Path (Abs, Dir, Path, Rel, fromAbsDir, isProperPrefixOf, parseRelDir, toFilePath) +import System.FilePattern qualified as FilePattern import Text.Megaparsec ( MonadParsec (eof, takeWhile1P, try), Parsec, @@ -86,11 +94,56 @@ instance Monoid AllFilters where data FilterCombination a = FilterCombination { _combinedTargets :: [TargetFilter] , _combinedPaths :: [Path Rel Dir] + , _combinedPathGlobs :: [Glob Rel] } deriving (Eq, Ord, Show, Generic) +-- | Hand-written so the new @_combinedPathGlobs@ field is omitted when it's +-- empty. Otherwise every serialized 'FilterCombination' would gain a +-- @"_combinedPathGlobs":[]@ key compared to pre-glob-support encodings — even +-- in runs that don't configure any globs — which would break consumers that +-- pin on the JSON shape (telemetry, debug bundles). instance ToJSON (FilterCombination a) where - toEncoding = genericToEncoding defaultOptions + toEncoding fc = + pairs $ + "_combinedTargets" .= _combinedTargets fc + <> "_combinedPaths" .= _combinedPaths fc + <> if null (_combinedPathGlobs fc) then mempty else "_combinedPathGlobs" .= _combinedPathGlobs fc + +-- | A user-supplied path filter entry from `.fossa.yml`. Strings containing +-- `*` are parsed as 'Glob' patterns; all other strings are parsed as relative +-- directory paths, preserving the existing "match this directory and its +-- children" semantics. 
+data PathFilter + = PathFilterDir (Path Rel Dir) + | PathFilterGlob (Glob Rel) + deriving (Eq, Ord, Show, Generic) + +instance FromJSON PathFilter where + parseJSON = withText "PathFilter" $ \txt -> + let s = toString txt + in if '*' `elem` s + then pure . PathFilterGlob . Glob.unsafeGlobRel $ normalizeSlashes s + else case parseRelDir s of + Left err -> fail (show err) + Right p -> pure $ PathFilterDir p + +-- | Normalize backslashes to forward slashes. 'System.FilePattern' only treats +-- @/@ as a segment separator, so any user-supplied pattern or path containing +-- backslashes must be normalized before glob matching. +normalizeSlashes :: String -> String +normalizeSlashes = map toForwardSlash + where + toForwardSlash '\\' = '/' + toForwardSlash c = c + +-- | Split a list of user-supplied path filter entries into concrete directory +-- paths and glob patterns. +partitionPathFilters :: [PathFilter] -> ([Path Rel Dir], [Glob Rel]) +partitionPathFilters = foldr go ([], []) + where + go (PathFilterDir p) (ps, gs) = (p : ps, gs) + go (PathFilterGlob g) (ps, gs) = (ps, g : gs) data MavenScopeFilters = MavenScopeIncludeFilters (FilterSet Include) | MavenScopeExcludeFilters (FilterSet Exclude) deriving (Eq, Ord, Show, Generic) @@ -119,10 +172,11 @@ data Include data Exclude instance Semigroup (FilterCombination a) where - (FilterCombination a1 b1) <> (FilterCombination a2 b2) = FilterCombination (a1 <> a2) (b1 <> b2) + (FilterCombination a1 b1 c1) <> (FilterCombination a2 b2 c2) = + FilterCombination (a1 <> a2) (b1 <> b2) (c1 <> c2) instance Monoid (FilterCombination a) where - mempty = FilterCombination mempty mempty + mempty = FilterCombination mempty mempty mempty mavenScopeFilterSet :: MavenScopeFilters -> Set Text mavenScopeFilterSet (MavenScopeIncludeFilters filterSet) = scopes filterSet @@ -177,22 +231,50 @@ extractPureTool = \case -- * If a path is included, we allow that path and all of its parents and children. 
-- * If no paths are specifically included, we only reject explicitly excluded paths and their children. -- * If a path is excluded and included, it is rejected. +-- * Glob patterns (e.g. @**\/vendor\/**@) may appear alongside concrete paths. +-- A path is excluded if any exclusion glob matches it. Because tree walkers +-- skip excluded directories before descending, a glob like @**\/vendor@ +-- prunes the entire @vendor@ subtree even if the glob itself does not match +-- every descendant. -- TODO: Is it possible to allow conflicted items? If so, is it possible without creating multiple versions of this function? pathAllowed :: AllFilters -> Path Rel Dir -> Bool pathAllowed AllFilters{..} path = isIncluded && not isExcluded where - includeIsEmpty = null includedPaths - isExcluded = isExcludedMember || isChildOfExcludeMember + includeIsEmpty = null includedPaths && null includedGlobs + isExcluded = isExcludedMember || isChildOfExcludeMember || isExcludedByGlob -- We include parents because our directory scanner will never make it to the included path without the parents -- We include children because our analysis filtering allows children of included members - isIncluded = includeIsEmpty || isParentOfIncludeMember || isIncludeMember || isChildOfIncludeMember + isIncluded = + includeIsEmpty + || isParentOfIncludeMember + || isIncludeMember + || isChildOfIncludeMember + || isIncludedByGlob + || isParentOfIncludedGlob + || isChildOfIncludedGlob isIncludeMember = path `elem` includedPaths isExcludedMember = path `elem` excludedPaths isChildOfIncludeMember = any (`isProperPrefixOf` path) includedPaths isChildOfExcludeMember = any (`isProperPrefixOf` path) excludedPaths isParentOfIncludeMember = any (path `isProperPrefixOf`) includedPaths + isExcludedByGlob = any (`globMatchesDir` path) excludedGlobs + isIncludedByGlob = any (`globMatchesDir` path) includedGlobs + -- A glob like @apps/*@ literally matches @apps/X@; if the walker is at + -- @apps/@, @isIncludedByGlob@ is 
False (the glob requires one segment past + -- @apps@), and without this branch the walker would refuse to descend and + -- silently drop every project under @apps/@. Allow @path@ when its + -- segments are a prefix of the glob's literal directory prefix + -- (everything before the first wildcard); a glob whose first segment is + -- @**@ has an empty literal prefix, so any path is a candidate ancestor. + isParentOfIncludedGlob = any (pathSegmentsPrefixOf (pathSegments path)) includedGlobs + -- Once an ancestor of @path@ matched an include glob, the walker should + -- descend into all of @path@'s descendants — same semantics as + -- @isChildOfIncludeMember@ for concrete includes. + isChildOfIncludedGlob = any (\g -> any (g `globMatchesDir`) (properAncestors path)) includedGlobs includedPaths = combinedPaths includeFilters excludedPaths = combinedPaths excludeFilters + includedGlobs = combinedPathGlobs includeFilters + excludedGlobs = combinedPathGlobs excludeFilters isDefaultNonProductionPath :: Path Abs Dir -> Path Abs Dir -> Bool isDefaultNonProductionPath baseDir projPath = @@ -231,10 +313,16 @@ ignoredPaths = ] comboInclude :: [TargetFilter] -> [Path Rel Dir] -> FilterCombination Include -comboInclude = FilterCombination +comboInclude ts ps = FilterCombination ts ps [] comboExclude :: [TargetFilter] -> [Path Rel Dir] -> FilterCombination Exclude -comboExclude = FilterCombination +comboExclude ts ps = FilterCombination ts ps [] + +comboIncludeWithGlobs :: [TargetFilter] -> [Path Rel Dir] -> [Glob Rel] -> FilterCombination Include +comboIncludeWithGlobs = FilterCombination + +comboExcludeWithGlobs :: [TargetFilter] -> [Path Rel Dir] -> [Glob Rel] -> FilterCombination Exclude +comboExcludeWithGlobs = FilterCombination setInclude :: (Set Text) -> FilterSet Include setInclude = FilterSet @@ -248,6 +336,9 @@ combinedTargets = _combinedTargets combinedPaths :: FilterCombination a -> [Path Rel Dir] combinedPaths = _combinedPaths +combinedPathGlobs :: FilterCombination a 
-> [Glob Rel] +combinedPathGlobs = _combinedPathGlobs + -- applyFilters determines if legacy filters are present and if they need to converted to `TargetFilters` for filtering. applyFilters :: AllFilters -> Text -> Path Rel Dir -> FoundTargets -> Maybe FoundTargets applyFilters (AllFilters onlyFilters excludeFilters) tool dir = filterFoundTargets (apply onlyFilters excludeFilters tool dir) @@ -274,7 +365,7 @@ apply include exclude buildtool dir = -- Nothing = "Unknown" -- i.e., there were no filters that matched the buildtool + directories. applyComb :: FilterCombination a -> Text -> Path Rel Dir -> Maybe FilterMatch applyComb comb buildtool dir = - buildTargetFiltersResult <> pathFiltersResult + buildTargetFiltersResult <> pathFiltersResult <> globFiltersResult where buildTargetFiltersResult :: Maybe FilterMatch buildTargetFiltersResult = foldMap' (\t -> applyTarget t buildtool dir) (combinedTargets comb) @@ -282,6 +373,9 @@ applyComb comb buildtool dir = pathFiltersResult :: Maybe FilterMatch pathFiltersResult = foldMap' (`applyPath` dir) (combinedPaths comb) + globFiltersResult :: Maybe FilterMatch + globFiltersResult = foldMap' (`applyGlob` dir) (combinedPathGlobs comb) + applyTarget :: TargetFilter -> Text -> Path Rel Dir -> FilterMatch applyTarget (TypeTarget t) u _ = if t == u then MatchAll else MatchNone applyTarget (TypeDirTarget t p) u q = if t == u && p == q then MatchAll else MatchNone @@ -291,6 +385,83 @@ applyTarget (TypeDirTargetTarget t p target) u q = if t == u && p == q then Matc applyPath :: Path Rel Dir -> Path Rel Dir -> FilterMatch applyPath t u = if isProperPrefixOf t u || t == u then MatchAll else MatchNone +-- | Apply a glob pattern to a relative directory. Returns 'MatchAll' if the +-- glob matches the directory, else 'MatchNone'. +applyGlob :: Glob Rel -> Path Rel Dir -> FilterMatch +applyGlob g dir = if g `globMatchesDir` dir then MatchAll else MatchNone + +-- | Match a glob against a relative directory path. 
Normalizes the path so +-- glob matching is portable: strips the trailing slash that 'toString' appends +-- to a 'Path Rel Dir' (so @node_modules/*@ matches @node_modules/foo/@), and +-- converts backslashes to forward slashes so user-supplied forward-slash +-- patterns match the backslash-separated paths produced on Windows. +globMatchesDir :: Glob Rel -> Path Rel Dir -> Bool +globMatchesDir glob dir = unGlob glob FilePattern.?== normalize (toString dir) + where + normalize :: String -> String + normalize = trimTrailingSlash . normalizeSlashes + + trimTrailingSlash :: String -> String + trimTrailingSlash s = case unsnoc s of + Just (initSegs, '/') -> initSegs + _ -> s + + -- Matches @Data.List.unsnoc@ from base-4.19 (GHC 9.8), which we can't + -- import directly because the project supports @base >= 4.15@. + unsnoc :: [a] -> Maybe ([a], a) + unsnoc = foldr go Nothing + where + go x Nothing = Just ([], x) + go x (Just (xs, y)) = Just (x : xs, y) + +-- | Path components, in walk order. @"a/b/c/" -> ["a","b","c"]@. The trailing +-- empty segment that 'splitOn' would produce on a directory's @toFilePath@ +-- result is dropped. +pathSegments :: Path Rel Dir -> [String] +pathSegments p = + filter (not . null) $ goSplit (normalizeSlashes (toFilePath p)) + where + goSplit :: String -> [String] + goSplit s = case break (== '/') s of + (h, []) -> [h] + (h, _ : t) -> h : goSplit t + +-- | True if @segs@ is a (possibly empty) prefix of @glob@'s literal directory +-- prefix — i.e. the segments before the first segment containing a wildcard. +-- A glob whose very first segment is wildcarded (e.g. @**\/foo@) has an empty +-- literal prefix and matches any input @segs@, because the @**@ can stand for +-- arbitrary ancestor directories. +pathSegmentsPrefixOf :: [String] -> Glob Rel -> Bool +pathSegmentsPrefixOf segs g = + let prefix = takeWhile (notElem '*') (filter (not . 
null) (splitSlash (normalizeSlashes (unGlob g)))) + in -- An empty literal prefix means the glob's first segment is wildcarded + -- (e.g. @**/service/**@, @*/foo@). The wildcard can stand for any + -- ancestor segments, so any path is a candidate parent — let the walker + -- descend, the actual match check ('isIncludedByGlob') will fire when + -- it reaches a real match. Otherwise require @segs@ to be a prefix of + -- the literal directory part. + null prefix || segs `isPrefixOf` prefix + where + splitSlash :: String -> [String] + splitSlash s = case break (== '/') s of + (h, []) -> [h] + (h, _ : t) -> h : splitSlash t + +-- | Strict ancestors of a relative directory, ordered shallow-first. Excludes +-- both the repository root (which would be an empty 'Path Rel Dir' that +-- 'parseRelDir' won't accept anyway) and the directory itself. +properAncestors :: Path Rel Dir -> [Path Rel Dir] +properAncestors p = + let segs = pathSegments p + -- Strict, non-empty prefixes: e.g. ["a","b","c"] -> [["a"], ["a","b"]]. + properPrefixes = map (`take` segs) [1 .. length segs - 1] + in mapMaybe (parseRelDir . joinSegments) properPrefixes + where + joinSegments :: [String] -> String + joinSegments [] = "" + joinSegments [x] = x <> "/" + joinSegments (x : rest) = x <> "/" <> joinSegments rest + -- MatchNone <> MatchAll = MatchAll is the reason for this order -- (MatchSome <> MatchAll) and (MatchAll <> MatchSome) outputs the results in MatchSome. 
-- The implications of this are that if a TypeTargetFilter matches all targets and diff --git a/src/Discovery/Walk.hs b/src/Discovery/Walk.hs index 37efcbab4..e01d7f4af 100644 --- a/src/Discovery/Walk.hs +++ b/src/Discovery/Walk.hs @@ -5,6 +5,7 @@ module Discovery.Walk ( walkWithFilters', WalkStep (..), findFileInAncestor, + enumeratePrunedSubtrees, -- * Helpers fileName, @@ -145,6 +146,27 @@ walkWithFilters' f root = do let f' dir subdirs files = pathFilterIntercept filters root dir subdirs $ f dir subdirs files walk' f' root +-- | Return the relative-to-root paths of every directory pruned by the +-- 'AllFilters' include/exclude path rules. Useful for surfacing pruned dirs +-- to the user once at startup, before any per-strategy walks (which would +-- otherwise emit one log per strategy that reaches the same prune). +enumeratePrunedSubtrees :: + ( Has ReadFS sig m + , Has Diagnostics sig m + ) => + AllFilters -> + Path Abs Dir -> + m [Path Rel Dir] +enumeratePrunedSubtrees filters root = walk' visit root + where + visit _dir subdirs _files = do + let pruned = do + subdir <- subdirs + stripped <- maybe [] pure (stripProperPrefix root subdir) + if pathAllowed filters stripped then [] else [stripped] + skipNames = map (toText . toFilePath . dirname) pruned + pure (pruned, WalkSkipSome skipNames) + -- | Search upwards in the directory tree for the existence of the supplied file. 
findFileInAncestor :: (Has ReadFS sig m, Has Diagnostics sig m) => Path Abs Dir -> Text -> m (Path Abs File) findFileInAncestor dir file = do diff --git a/test/Discovery/FiltersSpec.hs b/test/Discovery/FiltersSpec.hs index 1076de805..a0993755a 100644 --- a/test/Discovery/FiltersSpec.hs +++ b/test/Discovery/FiltersSpec.hs @@ -6,7 +6,9 @@ module Discovery.FiltersSpec ( import Control.Carrier.Reader (run, runReader) import Control.Monad (when) +import Data.Aeson qualified as Aeson import Data.Foldable (traverse_) +import Data.Glob qualified as Glob import Data.Set qualified as Set import Data.Set.NonEmpty (nonEmpty) import Data.String.Conversion (ToText (toText)) @@ -16,9 +18,13 @@ import Discovery.Filters ( Exclude, FilterCombination, Include, + PathFilter (..), applyFilters, comboExclude, + comboExcludeWithGlobs, comboInclude, + comboIncludeWithGlobs, + partitionPathFilters, pathAllowed, toolAllowed, withToolFilter, @@ -260,6 +266,146 @@ spec = do pathAllowed bigFilters $(mkRelDir "a/b/c") `shouldBe` False pathAllowed bigFilters $(mkRelDir "a/b/d") `shouldBe` True + describe "Glob-based matching" $ do + it "excludes paths matching a '**/name/**' glob" $ do + let filters = excludeGlob "**/vendor/**" + pathAllowed filters $(mkRelDir "foo/vendor/bar") `shouldBe` False + pathAllowed filters $(mkRelDir "a/b/vendor/c") `shouldBe` False + pathAllowed filters $(mkRelDir "foo/bar") `shouldBe` True + + it "excludes paths matching a single-segment wildcard glob" $ do + let filters = excludeGlob "node_modules/*" + pathAllowed filters $(mkRelDir "node_modules/react") `shouldBe` False + pathAllowed filters $(mkRelDir "node_modules/@scope") `shouldBe` False + pathAllowed filters $(mkRelDir "src/node_modules/react") `shouldBe` True + + it "still excludes the subtree when the walker reaches the glob-matched dir" $ do + -- When the tree walker reaches foo/vendor/, that dir matches + -- `**/vendor/**` (the final ** matches an empty tail in + -- System.FilePattern), so the walker 
prunes the subtree and never + -- descends. Covers the documented expectation. + let filters = excludeGlob "**/vendor/**" + pathAllowed filters $(mkRelDir "foo/vendor") `shouldBe` False + + it "combines glob excludes with concrete path excludes" $ do + let filters = + excludePath $(mkRelDir "build") + <> excludeGlob "**/*.test" + pathAllowed filters $(mkRelDir "build") `shouldBe` False + pathAllowed filters $(mkRelDir "build/artifacts") `shouldBe` False + pathAllowed filters $(mkRelDir "foo/x.test") `shouldBe` False + pathAllowed filters $(mkRelDir "foo/bar") `shouldBe` True + + it "rejects an include-glob miss" $ do + let filters = includeGlob "src/**" + pathAllowed filters $(mkRelDir "src/lib") `shouldBe` True + pathAllowed filters $(mkRelDir "test/lib") `shouldBe` False + + it "lets the walker reach an include-glob's matches via ancestors" $ do + -- `apps/*` matches @apps/X@, not the bare @apps/@. Without ancestor + -- handling the walker would refuse to descend into @apps/@ and every + -- project under it would be silently dropped. Cover the parent dir + -- (must be allowed so the walker descends), the matched dirs (must + -- be allowed because the glob matches), and an unrelated sibling + -- (must still be rejected). + let filters = includeGlob "apps/*" + pathAllowed filters $(mkRelDir "apps") `shouldBe` True + pathAllowed filters $(mkRelDir "apps/foo") `shouldBe` True + pathAllowed filters $(mkRelDir "apps/foo/src") `shouldBe` True + pathAllowed filters $(mkRelDir "lib") `shouldBe` False + + it "should include all parents of glob matches" $ do + -- pathAllowed is what determines if the walker should traverse the + -- directory. For this test, we're asserting that `**/` should traverse + -- every directory, so all lines should be true. The previous block of + -- `apps/*` is asserting on a false example. 
+ let filters = includeGlob "**/service/**" + pathAllowed filters $(mkRelDir "anywhere") `shouldBe` True + pathAllowed filters $(mkRelDir "anywhere/else") `shouldBe` True + pathAllowed filters $(mkRelDir "deep/nested/service/foo") `shouldBe` True + + it "prefers exclude over include when both globs match" $ do + let filters = includeGlob "src/**" <> excludeGlob "src/**" + pathAllowed filters $(mkRelDir "src/lib") `shouldBe` False + + it "normalizes the trailing slash on Path Rel Dir for glob matching" $ do + -- Regression guard: `Path.toString` on a `Path Rel Dir` appends '/', + -- which would otherwise cause `System.FilePattern` to reject single- + -- segment patterns like `node_modules/*`. `globMatchesDir` strips the + -- trailing slash before matching; this test fails if that + -- normalization regresses. + let filters = excludeGlob "node_modules/*" + pathAllowed filters $(mkRelDir "node_modules/react") `shouldBe` False + pathAllowed filters $(mkRelDir "node_modules/lodash") `shouldBe` False + + it "anchors root-level globs to the repo root" $ do + let filters = excludeGlob "build*" + pathAllowed filters $(mkRelDir "build") `shouldBe` False + pathAllowed filters $(mkRelDir "build123") `shouldBe` False + -- A nested `build` directory must not be matched by a root-anchored + -- single-segment glob. + pathAllowed filters $(mkRelDir "src/build") `shouldBe` True + pathAllowed filters $(mkRelDir "a/b/build123") `shouldBe` True + + it "anchors root-level extension globs to the repo root" $ do + let filters = excludeGlob "*.lock" + pathAllowed filters $(mkRelDir "package.lock") `shouldBe` False + pathAllowed filters $(mkRelDir "src/package.lock") `shouldBe` True + + it "applies a four-way mix of include/exclude globs and concrete paths" $ do + -- Include: concrete `src` plus glob `lib/**`. + -- Exclude: concrete `src/build` plus glob `**/vendor/**`. 
+ let filters = + AllFilters + ( comboIncludeWithGlobs + mempty + [$(mkRelDir "src")] + [Glob.unsafeGlobRel "lib/**"] + ) + ( comboExcludeWithGlobs + mempty + [$(mkRelDir "src/build")] + [Glob.unsafeGlobRel "**/vendor/**"] + ) + -- Concrete include: `src` and its children are accepted. + pathAllowed filters $(mkRelDir "src") `shouldBe` True + pathAllowed filters $(mkRelDir "src/app") `shouldBe` True + -- Concrete exclude wins over the concrete include. + pathAllowed filters $(mkRelDir "src/build") `shouldBe` False + pathAllowed filters $(mkRelDir "src/build/out") `shouldBe` False + -- Glob include: `lib` and its descendants are accepted. + pathAllowed filters $(mkRelDir "lib") `shouldBe` True + pathAllowed filters $(mkRelDir "lib/foo/bar") `shouldBe` True + -- Glob exclude wins over both kinds of include. + pathAllowed filters $(mkRelDir "lib/foo/vendor/x") `shouldBe` False + pathAllowed filters $(mkRelDir "src/app/vendor/x") `shouldBe` False + -- Paths matched by neither include are rejected. + pathAllowed filters $(mkRelDir "test") `shouldBe` False + pathAllowed filters $(mkRelDir "other/dir") `shouldBe` False + + describe "PathFilter parsing" $ do + it "partitions concrete paths from glob patterns" $ do + let mixed = + [ PathFilterDir $(mkRelDir "vendor") + , PathFilterGlob (Glob.unsafeGlobRel "**/node_modules/**") + , PathFilterDir $(mkRelDir "build") + , PathFilterGlob (Glob.unsafeGlobRel "*.test") + ] + (paths, globs) = partitionPathFilters mixed + paths `shouldBe` [$(mkRelDir "vendor"), $(mkRelDir "build")] + map Glob.unGlob globs `shouldBe` ["**/node_modules/**", "*.test"] + + it "normalizes backslashes in glob patterns to forward slashes" $ do + -- A Windows user typing `node_modules\*` in `.fossa.yml` should get + -- the same glob as the forward-slash form, because System.FilePattern + -- only treats `/` as a segment separator. 
+ let parse :: String -> Maybe PathFilter + parse s = case Aeson.fromJSON (Aeson.String (toText s)) :: Aeson.Result PathFilter of + Aeson.Success p -> Just p + Aeson.Error _ -> Nothing + parse "node_modules\\*" `shouldBe` parse "node_modules/*" + parse "**\\vendor\\**" `shouldBe` parse "**/vendor/**" + describe "tool filtering helpers" $ do it "should return an empty list when the tool is not allowed" $ do let filters = excludeTool CargoProjectType @@ -300,3 +446,13 @@ includeTool :: DiscoveredProjectType -> AllFilters includeTool tool = AllFilters include mempty where include = comboInclude [TypeTarget $ toText tool] mempty + +includeGlob :: String -> AllFilters +includeGlob glob = AllFilters include mempty + where + include = comboIncludeWithGlobs mempty mempty [Glob.unsafeGlobRel glob] + +excludeGlob :: String -> AllFilters +excludeGlob glob = AllFilters mempty exclude + where + exclude = comboExcludeWithGlobs mempty mempty [Glob.unsafeGlobRel glob] diff --git a/test/Discovery/WalkSpec.hs b/test/Discovery/WalkSpec.hs index 3dd14d668..82b081fe8 100644 --- a/test/Discovery/WalkSpec.hs +++ b/test/Discovery/WalkSpec.hs @@ -5,6 +5,8 @@ module Discovery.WalkSpec ( spec, ) where +import App.Fossa.Config.Common (collectConfigFileFilters) +import App.Fossa.Config.ConfigFile (ConfigFile) import Control.Carrier.Reader (runReader) import Control.Carrier.State.Strict (runState) import Control.Carrier.Writer.Strict (runWriter, tell) @@ -14,7 +16,8 @@ import Control.Effect.State (get, put) import Data.Foldable (traverse_) import Data.Map (Map) import Data.Map qualified as Map -import Discovery.Filters (AllFilters) +import Data.Yaml (decodeThrow) +import Discovery.Filters (AllFilters (AllFilters), comboInclude) import Discovery.Walk import Effect.ReadFS import Path @@ -61,6 +64,106 @@ walkWithFilters'Spec = fooPermissions <- getPermissions foo setPermissions bar fooPermissions + -- Mirror of the exclude test above: with an include filter, the walker + -- should accept the 
included subtree (including ancestors needed to reach
+ -- it) but prune siblings that are neither the include nor an ancestor.
+ it' "respects include-path filters" . withTempDir "test-Discovery-Walk-include" $ \tmpDir -> do
+ let dirs@[foo, bar, _baz] =
+ map
+ (tmpDir </>)
+ [ $(mkRelDir "foo")
+ , $(mkRelDir "foo/bar")
+ , $(mkRelDir "foo/baz")
+ ]
+ sendIO $ traverse_ createDir dirs
+
+ let filters = includePath $(mkRelDir "foo/bar")
+ paths <- runWalkWithFilters' 100 filters tmpDir
+ pathsToTree paths
+ `shouldBe'` dirTree
+ [
+ ( tmpDir
+ , dirTree
+ [
+ ( foo
+ , dirTree
+ [ (bar, dirTree [])
+ ]
+ )
+ ]
+ )
+ ]
+
+ -- Strategy-supplied 'WalkSkipSome' must compose with the filter-derived
+ -- prune list. Tree: tmpDir / { strategy-skip, filter-skip, keep }. The
+ -- visit function returns 'WalkSkipSome ["strategy-skip"]'; the filter
+ -- excludes 'filter-skip'. Both subtrees should be pruned, only 'keep'
+ -- and 'tmpDir' should appear in the visited set.
+ it' "merges strategy WalkSkipSome with filter prunes" . withTempDir "test-Discovery-Walk-merge" $ \tmpDir -> do
+ let dirs@[strategySkip, filterSkip, keep] =
+ map
+ (tmpDir </>)
+ [ $(mkRelDir "strategy-skip")
+ , $(mkRelDir "filter-skip")
+ , $(mkRelDir "keep")
+ ]
+ sendIO $ traverse_ createDir dirs
+
+ let filters = excludePath $(mkRelDir "filter-skip")
+ paths <- runWalkWithFiltersAndStep (WalkSkipSome ["strategy-skip"]) 100 filters tmpDir
+ pathsToTree paths
+ `shouldBe'` dirTree
+ [
+ ( tmpDir
+ , dirTree
+ [(keep, dirTree [])]
+ )
+ ]
+ -- Sanity: the prune-list directories really were created on disk;
+ -- otherwise the assertion above would pass vacuously.
+ traverse_ (\d -> doesDirExist d >>= (`shouldBe'` True)) [strategySkip, filterSkip]
+
+ -- End-to-end: a YAML config goes through the same parse path that
+ -- '.fossa.yml' takes, then 'collectConfigFileFilters' produces the
+ -- 'AllFilters' that the walker actually consumes. This catches the
+ -- "globs parse but never reach the walker" class of regression.
+ it' "applies paths.exclude globs parsed from a YAML config" . withTempDir "test-Discovery-Walk-yaml" $ \tmpDir -> do
+ let dirs@[zip', zipPython, src, srcLib] =
+ map
+ (tmpDir </>)
+ [ $(mkRelDir "zip")
+ , $(mkRelDir "zip/python")
+ , $(mkRelDir "src")
+ , $(mkRelDir "src/lib")
+ ]
+ sendIO $ traverse_ createDir dirs
+
+ let yaml =
+ "version: 3\n\
+ \paths:\n\
+ \ exclude:\n\
+ \ - \"**/zip/**\"\n"
+ cfgFn <- sendIO $ decodeThrow yaml
+ let cfgPath = tmpDir </> $(mkRelFile ".fossa.yml")
+ cfg = cfgFn cfgPath :: ConfigFile
+ filters = collectConfigFileFilters cfg
+
+ paths <- runWalkWithFilters' 100 filters tmpDir
+ pathsToTree paths
+ `shouldBe'` dirTree
+ [
+ ( tmpDir
+ , dirTree
+ [
+ ( src
+ , dirTree [(srcLib, dirTree [])]
+ )
+ ]
+ )
+ ]
+ -- Make sure the dirs we expected to be pruned actually exist on disk.
+ traverse_ (\d -> doesDirExist d >>= (`shouldBe'` True)) [zip', zipPython]
+
 walkSpec :: Spec
 walkSpec =
   describe "walk" $ do
@@ -162,7 +265,22 @@
 runWalkWithFilters' ::
   AllFilters ->
   Path Abs Dir ->
   m [Path Abs Dir]
-runWalkWithFilters' maxIters filters startDir =
+runWalkWithFilters' = runWalkWithFiltersAndStep WalkContinue
+
+-- | Like 'runWalkWithFilters'', but lets the caller pin the 'WalkStep' that
+-- every visited directory returns. Use 'WalkSkipSome' to verify that the
+-- caller's skip list is merged with the filter-derived prunes inside
+-- 'pathFilterIntercept'.
+runWalkWithFiltersAndStep ::
+ ( Has ReadFS sig m
+ , Has Diagnostics sig m
+ ) =>
+ WalkStep ->
+ Int ->
+ AllFilters ->
+ Path Abs Dir ->
+ m [Path Abs Dir]
+runWalkWithFiltersAndStep userStep maxIters filters startDir = do
   fmap fst . 
runWriter @@ -176,12 +294,15 @@ runWalkWithFilters' maxIters filters startDir = then do put (iterations + 1) tell [dir] - pure ((), WalkContinue) + pure ((), userStep) else do pure ((), WalkStop) ) startDir +includePath :: Path Rel Dir -> AllFilters +includePath path = AllFilters (comboInclude mempty [path]) mempty + runWalkWithCircuitBreaker :: (Has ReadFS sig m, Has Diagnostics sig m) => Int -> Path Abs Dir -> m [Path Abs Dir] runWalkWithCircuitBreaker maxIters startDir =