diff --git a/cmd/frontend/internal/httpapi/httpapi.go b/cmd/frontend/internal/httpapi/httpapi.go index 1d7b6f863491..e31c1db418d5 100644 --- a/cmd/frontend/internal/httpapi/httpapi.go +++ b/cmd/frontend/internal/httpapi/httpapi.go @@ -93,6 +93,7 @@ func NewInternalHandler(m *mux.Router) http.Handler { m.Get(apirouter.Telemetry).Handler(trace.TraceRoute(telemetryHandler)) m.Get(apirouter.GraphQL).Handler(trace.TraceRoute(handler(serveGraphQL))) m.Get(apirouter.Configuration).Handler(trace.TraceRoute(handler(serveConfiguration))) + m.Get(apirouter.SearchConfiguration).Handler(trace.TraceRoute(handler(serveSearchConfiguration))) m.Path("/ping").Methods("GET").Name("ping").HandlerFunc(handlePing) m.NotFoundHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/cmd/frontend/internal/httpapi/internal.go b/cmd/frontend/internal/httpapi/internal.go index 74249be1e76c..9116cc5e06e7 100644 --- a/cmd/frontend/internal/httpapi/internal.go +++ b/cmd/frontend/internal/httpapi/internal.go @@ -170,6 +170,25 @@ func serveConfiguration(w http.ResponseWriter, r *http.Request) error { return nil } +// serveSearchConfiguration is _only_ used by the zoekt index server. Zoekt does +// not depend on frontend and therefore does not have access to `conf.Watch`. +// Additionally, it only cares about certain search specific settings so this +// search specific endpoint is used rather than serving the entire site settings +// from /.internal/configuration. 
+func serveSearchConfiguration(w http.ResponseWriter, r *http.Request) error { + largeFiles := conf.Get().SearchLargeFiles + opts := struct { + LargeFiles []string + }{ + LargeFiles: largeFiles, + } + err := json.NewEncoder(w).Encode(opts) + if err != nil { + return errors.Wrap(err, "encode") + } + return nil +} + func serveReposList(w http.ResponseWriter, r *http.Request) error { var opt db.ReposListOptions err := json.NewDecoder(r.Body).Decode(&opt) diff --git a/cmd/frontend/internal/httpapi/router/router.go b/cmd/frontend/internal/httpapi/router/router.go index 2d8943321cfd..828984b39bd8 100644 --- a/cmd/frontend/internal/httpapi/router/router.go +++ b/cmd/frontend/internal/httpapi/router/router.go @@ -40,6 +40,7 @@ const ( ReposListEnabled = "internal.repos.list-enabled" ReposUpdateMetadata = "internal.repos.update-metadata" Configuration = "internal.configuration" + SearchConfiguration = "internal.search-configuration" ExternalServiceConfigs = "internal.external-services.configs" ExternalServicesList = "internal.external-services.list" ) @@ -104,6 +105,7 @@ func NewInternal(base *mux.Router) *mux.Router { base.Path("/repos/update-metadata").Methods("POST").Name(ReposUpdateMetadata) base.Path("/repos/{RepoName:.*}").Methods("POST").Name(ReposGetByName) base.Path("/configuration").Methods("POST").Name(Configuration) + base.Path("/search/configuration").Methods("GET").Name(SearchConfiguration) addRegistryRoute(base) addGraphQLRoute(base) addTelemetryRoute(base) diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index 3e2c3a9dc6e3..9cfb32a9a039 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -7,12 +7,16 @@ import ( "context" "crypto/sha256" "encoding/hex" + "fmt" "io" "log" + "path/filepath" + "strings" "sync" "time" "github.com/sourcegraph/sourcegraph/pkg/api" + "github.com/sourcegraph/sourcegraph/pkg/conf" "github.com/sourcegraph/sourcegraph/pkg/diskcache" "github.com/sourcegraph/sourcegraph/pkg/gitserver" 
"github.com/sourcegraph/sourcegraph/pkg/mutablelimiter" @@ -120,8 +124,10 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api. return "", errors.Errorf("commit must be resolved (repo=%q, commit=%q)", repo.Name, commit) } + largeFilePatterns := conf.Get().SearchLargeFiles + // key is a sha256 hash since we want to use it for the disk name - h := sha256.Sum256([]byte(string(repo.Name) + " " + string(commit))) + h := sha256.Sum256([]byte(fmt.Sprintf("%q %q %q", repo.Name, commit, largeFilePatterns))) key := hex.EncodeToString(h[:]) span.LogKV("key", key) @@ -137,7 +143,7 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api. // since we're just going to close it again immediately. bgctx := opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx)) f, err := s.cache.Open(bgctx, key, func(ctx context.Context) (io.ReadCloser, error) { - return s.fetch(ctx, repo, commit) + return s.fetch(ctx, repo, commit, largeFilePatterns) }) var path string if f != nil { @@ -164,7 +170,7 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api. // fetch fetches an archive from the network and stores it on disk. It does // not populate the in-memory cache. You should probably be calling // prepareZip. 
-func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.CommitID) (rc io.ReadCloser, err error) { +func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.CommitID, largeFilePatterns []string) (rc io.ReadCloser, err error) { fetchQueueSize.Inc() ctx, releaseFetchLimiter, err := s.fetchLimiter.Acquire(ctx) // Acquire concurrent fetches semaphore if err != nil { @@ -225,7 +231,7 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi defer r.Close() tr := tar.NewReader(r) zw := zip.NewWriter(pw) - err := copySearchable(tr, zw) + err := copySearchable(tr, zw, largeFilePatterns) if err1 := zw.Close(); err == nil { err = err1 } @@ -239,7 +245,7 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi // copySearchable copies searchable files from tr to zw. A searchable file is // any file that is a candidate for being searched (under size limit and // non-binary). -func copySearchable(tr *tar.Reader, zw *zip.Writer) error { +func copySearchable(tr *tar.Reader, zw *zip.Writer, largeFilePatterns []string) error { // 32*1024 is the same size used by io.Copy buf := make([]byte, 32*1024) for { @@ -276,8 +282,9 @@ func copySearchable(tr *tar.Reader, zw *zip.Writer) error { return err } - // We do not search the content of large files - if hdr.Size > maxFileSize { + // We do not search the content of large files unless they are + // whitelisted. + if hdr.Size > maxFileSize && !ignoreSizeMax(hdr.Name, largeFilePatterns) { continue } @@ -337,6 +344,18 @@ func (s *Store) watchAndEvict() { } } +// ignoreSizeMax determines whether the max size should be ignored. It uses +// the glob syntax found here: https://golang.org/pkg/path/filepath/#Match. 
+func ignoreSizeMax(name string, patterns []string) bool { + for _, pattern := range patterns { + pattern = strings.TrimSpace(pattern) + if m, _ := filepath.Match(pattern, name); m { + return true + } + } + return false +} + var ( cacheSizeBytes = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "searcher", diff --git a/cmd/searcher/search/store_test.go b/cmd/searcher/search/store_test.go index 1587912ccb68..956979effe71 100644 --- a/cmd/searcher/search/store_test.go +++ b/cmd/searcher/search/store_test.go @@ -94,6 +94,37 @@ func TestPrepareZip_fetchTarFail(t *testing.T) { } } +func TestIgnoreSizeMax(t *testing.T) { + patterns := []string{ + "foo", + "foo.*", + "foo_*", + "*.foo", + "bar.baz", + } + tests := []struct { + name string + ignored bool + }{ + // Pass + {"foo", true}, + {"foo.bar", true}, + {"foo_bar", true}, + {"bar.baz", true}, + {"bar.foo", true}, + // Fail + {"baz.foo.bar", false}, + {"bar_baz", false}, + {"baz.baz", false}, + } + + for _, test := range tests { + if got, want := ignoreSizeMax(test.name, patterns), test.ignored; got != want { + t.Errorf("case %s got %v want %v", test.name, got, want) + } + } +} + func tmpStore(t *testing.T) (*Store, func()) { d, err := ioutil.TempDir("", "search_test") if err != nil { diff --git a/schema/schema.go b/schema/schema.go index 27114a7fb938..43f1ed713f45 100644 --- a/schema/schema.go +++ b/schema/schema.go @@ -432,6 +432,7 @@ type SiteConfiguration struct { ParentSourcegraph *ParentSourcegraph `json:"parentSourcegraph,omitempty"` RepoListUpdateInterval int `json:"repoListUpdateInterval,omitempty"` SearchIndexEnabled *bool `json:"search.index.enabled,omitempty"` + SearchLargeFiles []string `json:"search.largeFiles,omitempty"` } // SlackNotificationsConfig description: Configuration for sending notifications to Slack. 
diff --git a/schema/site.schema.json b/schema/site.schema.json index fe6a2ddec276..fda4cb5363fa 100644 --- a/schema/site.schema.json +++ b/schema/site.schema.json @@ -22,6 +22,15 @@ "!go": { "pointer": true }, "group": "Search" }, + "search.largeFiles": { + "description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size. The glob pattern syntax can be found here: https://golang.org/pkg/path/filepath/#Match.", + "type": "array", + "items": { + "type": "string" + }, + "group": "Search", + "examples": [["go.sum", "package-lock.json", "*.thrift"]] + }, "experimentalFeatures": { "description": "Experimental features to enable or disable. Features that are now enabled by default are marked as deprecated.", "type": "object", diff --git a/schema/site_stringdata.go b/schema/site_stringdata.go index eea199986a4f..9965d7cdbf76 100644 --- a/schema/site_stringdata.go +++ b/schema/site_stringdata.go @@ -27,6 +27,15 @@ const SiteSchemaJSON = `{ "!go": { "pointer": true }, "group": "Search" }, + "search.largeFiles": { + "description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size. The glob pattern syntax can be found here: https://golang.org/pkg/path/filepath/#Match.", + "type": "array", + "items": { + "type": "string" + }, + "group": "Search", + "examples": [["go.sum", "package-lock.json", "*.thrift"]] + }, "experimentalFeatures": { "description": "Experimental features to enable or disable. Features that are now enabled by default are marked as deprecated.", "type": "object",