From 25ac54d15daa57c3bc115a71de2fca1468eb3df5 Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Tue, 9 Apr 2019 18:36:52 -0700 Subject: [PATCH 01/15] add configuration option for searching large files --- schema/schema.go | 1 + schema/site.schema.json | 9 +++++++++ schema/site_stringdata.go | 9 +++++++++ 3 files changed, 19 insertions(+) diff --git a/schema/schema.go b/schema/schema.go index 27114a7fb938..0e22dad743d8 100644 --- a/schema/schema.go +++ b/schema/schema.go @@ -432,6 +432,7 @@ type SiteConfiguration struct { ParentSourcegraph *ParentSourcegraph `json:"parentSourcegraph,omitempty"` RepoListUpdateInterval int `json:"repoListUpdateInterval,omitempty"` SearchIndexEnabled *bool `json:"search.index.enabled,omitempty"` + SearchLargeFiles *[]string `json:"search.largeFiles,omitempty"` } // SlackNotificationsConfig description: Configuration for sending notifications to Slack. diff --git a/schema/site.schema.json b/schema/site.schema.json index fe6a2ddec276..d8b7e91615f0 100644 --- a/schema/site.schema.json +++ b/schema/site.schema.json @@ -22,6 +22,15 @@ "!go": { "pointer": true }, "group": "Search" }, + "search.largeFiles": { + "description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size.", + "type": "array", + "items": { + "type": "string" + }, + "!go": { "pointer": true }, + "group": "Search" + }, "experimentalFeatures": { "description": "Experimental features to enable or disable. Features that are now enabled by default are marked as deprecated.", "type": "object", diff --git a/schema/site_stringdata.go b/schema/site_stringdata.go index eea199986a4f..f80065fc9e12 100644 --- a/schema/site_stringdata.go +++ b/schema/site_stringdata.go @@ -27,6 +27,15 @@ const SiteSchemaJSON = `{ "!go": { "pointer": true }, "group": "Search" }, + "search.largeFiles": { + "description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size.", + "type": "array", + "items": { + "type": "string" + }, + "!go": { "pointer": true }, + "group": "Search" + }, "experimentalFeatures": { "description": "Experimental features to enable or disable. Features that are now enabled by default are marked as deprecated.", "type": "object", From b0d6a7cdc84b8c24b1764d791e7ad23119d98e1e Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Tue, 9 Apr 2019 18:37:13 -0700 Subject: [PATCH 02/15] add internal endpoint that serves search options --- cmd/frontend/internal/httpapi/httpapi.go | 1 + cmd/frontend/internal/httpapi/internal.go | 16 ++++++++++++++++ cmd/frontend/internal/httpapi/router/router.go | 2 ++ 3 files changed, 19 insertions(+) diff --git a/cmd/frontend/internal/httpapi/httpapi.go b/cmd/frontend/internal/httpapi/httpapi.go index 1d7b6f863491..e31c1db418d5 100644 --- a/cmd/frontend/internal/httpapi/httpapi.go +++ b/cmd/frontend/internal/httpapi/httpapi.go @@ -93,6 +93,7 @@ func NewInternalHandler(m *mux.Router) http.Handler { m.Get(apirouter.Telemetry).Handler(trace.TraceRoute(telemetryHandler)) m.Get(apirouter.GraphQL).Handler(trace.TraceRoute(handler(serveGraphQL))) m.Get(apirouter.Configuration).Handler(trace.TraceRoute(handler(serveConfiguration))) + m.Get(apirouter.SearchConfiguration).Handler(trace.TraceRoute(handler(serveSearchConfiguration))) m.Path("/ping").Methods("GET").Name("ping").HandlerFunc(handlePing) m.NotFoundHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/cmd/frontend/internal/httpapi/internal.go b/cmd/frontend/internal/httpapi/internal.go index 74249be1e76c..e214878d1f76 100644 --- a/cmd/frontend/internal/httpapi/internal.go +++ b/cmd/frontend/internal/httpapi/internal.go @@ -170,6 +170,22 @@ func serveConfiguration(w http.ResponseWriter, r *http.Request) error { return nil } +type searchOptions struct { + LargeFiles *[]string +} + +func serveSearchConfiguration(w http.ResponseWriter, r *http.Request) error { + largeFiles := conf.Get().SearchLargeFiles + opts := searchOptions{ + LargeFiles: largeFiles, + } + err := json.NewEncoder(w).Encode(opts) + if err != nil { + return errors.Wrap(err, "Encode") + } + return nil +} + func serveReposList(w http.ResponseWriter, r *http.Request) error { var opt db.ReposListOptions err := json.NewDecoder(r.Body).Decode(&opt) diff --git a/cmd/frontend/internal/httpapi/router/router.go b/cmd/frontend/internal/httpapi/router/router.go index 2d8943321cfd..828984b39bd8 100644 --- a/cmd/frontend/internal/httpapi/router/router.go +++ b/cmd/frontend/internal/httpapi/router/router.go @@ -40,6 +40,7 @@ const ( ReposListEnabled = "internal.repos.list-enabled" ReposUpdateMetadata = "internal.repos.update-metadata" Configuration = "internal.configuration" + SearchConfiguration = "internal.search-configuration" ExternalServiceConfigs = "internal.external-services.configs" ExternalServicesList = "internal.external-services.list" ) @@ -104,6 +105,7 @@ func NewInternal(base *mux.Router) *mux.Router { base.Path("/repos/update-metadata").Methods("POST").Name(ReposUpdateMetadata) base.Path("/repos/{RepoName:.*}").Methods("POST").Name(ReposGetByName) base.Path("/configuration").Methods("POST").Name(Configuration) + base.Path("/search/configuration").Methods("GET").Name(SearchConfiguration) addRegistryRoute(base) addGraphQLRoute(base) addTelemetryRoute(base) From 47c31fc496cc04563eedf06cfb713c34298800e9 Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Wed, 10 Apr 2019 16:04:05 -0700 Subject: [PATCH 03/15] wip: impl in searcher --- cmd/searcher/search/store.go | 77 ++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index 3e2c3a9dc6e3..4523049de6f3 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -9,13 +9,17 @@ import ( "encoding/hex" "io" "log" + "path/filepath" + "strings" "sync" "time" "github.com/sourcegraph/sourcegraph/pkg/api" + "github.com/sourcegraph/sourcegraph/pkg/conf" "github.com/sourcegraph/sourcegraph/pkg/diskcache" "github.com/sourcegraph/sourcegraph/pkg/gitserver" "github.com/sourcegraph/sourcegraph/pkg/mutablelimiter" + log15 "gopkg.in/inconshreveable/log15.v2" opentracing "github.com/opentracing/opentracing-go" "github.com/opentracing/opentracing-go/ext" @@ -65,6 +69,10 @@ type Store struct { // zipCache provides efficient access to repo zip files. zipCache zipCache + + // largeFilesPatterns is a list of large file glob patterns where files that + // match any pattern in the list should be searched regardless of their size. + largeFilePatterns []string } // SetMaxConcurrentFetchTar sets the maximum number of concurrent calls allowed @@ -225,7 +233,7 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi defer r.Close() tr := tar.NewReader(r) zw := zip.NewWriter(pw) - err := copySearchable(tr, zw) + err := copySearchable(tr, zw, s.ignoreSizeMax) if err1 := zw.Close(); err == nil { err = err1 } @@ -239,7 +247,8 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi // copySearchable copies searchable files from tr to zw. A searchable file is // any file that is a candidate for being searched (under size limit and // non-binary). -func copySearchable(tr *tar.Reader, zw *zip.Writer) error { +func copySearchable(tr *tar.Reader, zw *zip.Writer, ignoreSizeMax func(string) bool) error { + log15.Info("copy searchable") // 32*1024 is the same size used by io.Copy buf := make([]byte, 32*1024) for { @@ -276,8 +285,11 @@ func copySearchable(tr *tar.Reader, zw *zip.Writer) error { return err } - // We do not search the content of large files - if hdr.Size > maxFileSize { + _ = ignoreSizeMax(hdr.Name) + + // We do not search the content of large files unless they are + // whitelisted. + if hdr.Size > maxFileSize && !ignoreSizeMax(hdr.Name) { continue } @@ -337,6 +349,63 @@ func (s *Store) watchAndEvict() { } } +// ignoreSizeMax determines whether the max size should be ignored. It uses +// the glob syntax found here: https://golang.org/pkg/path/filepath/#Match. +func (s *Store) ignoreSizeMax(name string) bool { + log15.Info(">>>>>", "name", name) + for _, pattern := range s.largeFilePatterns { + pattern = strings.TrimSpace(pattern) + if m, _ := filepath.Match(pattern, name); m { + return true + } + } + return false +} + +func getLargeFilePatterns() []string { + lfp := conf.Get().SearchLargeFiles + if lfp == nil { + return []string{} + } + return *lfp +} + +// stringSlicesAreEqual checks to make sure that two string slices have the same +// items. +func stringSlicesAreEqual(a, b []string) bool { + if len(a) != len(b) { + return false + } + aMap := map[string]struct{}{} + for i := range a { + aMap[a[i]] = struct{}{} + } + for i := range b { + if _, ok := aMap[b[i]]; !ok { + return false + } + } + return true +} + +// watchLargeFilesSettingChange watches for changes to the search.largeFiles +// setting and clears the disk cache when it is changed. +func (s *Store) watchLargeFilesChange() { + if s.largeFilePatterns == nil { + s.largeFilePatterns = getLargeFilePatterns() + } + + conf.Watch(func() { + // Ensure the slices are actually different so we don't blow away the + // cache needlessly. + log15.Info("config changed", "old", s.largeFilePatterns, "new", getLargeFilePatterns()) + if lfp := getLargeFilePatterns(); !stringSlicesAreEqual(lfp, s.largeFilePatterns) { + s.largeFilePatterns = lfp + s.cache.Evict(0) + } + }) +} + var ( cacheSizeBytes = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "searcher", From 3db5d6d49e8d8d200df224219b0e9bd150da38ff Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Thu, 11 Apr 2019 11:13:45 -0700 Subject: [PATCH 04/15] not a pointer --- schema/schema.go | 2 +- schema/site.schema.json | 1 - schema/site_stringdata.go | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/schema/schema.go b/schema/schema.go index 0e22dad743d8..43f1ed713f45 100644 --- a/schema/schema.go +++ b/schema/schema.go @@ -432,7 +432,7 @@ type SiteConfiguration struct { ParentSourcegraph *ParentSourcegraph `json:"parentSourcegraph,omitempty"` RepoListUpdateInterval int `json:"repoListUpdateInterval,omitempty"` SearchIndexEnabled *bool `json:"search.index.enabled,omitempty"` - SearchLargeFiles *[]string `json:"search.largeFiles,omitempty"` + SearchLargeFiles []string `json:"search.largeFiles,omitempty"` } // SlackNotificationsConfig description: Configuration for sending notifications to Slack. diff --git a/schema/site.schema.json b/schema/site.schema.json index d8b7e91615f0..78e0c8ba245a 100644 --- a/schema/site.schema.json +++ b/schema/site.schema.json @@ -28,7 +28,6 @@ "items": { "type": "string" }, - "!go": { "pointer": true }, "group": "Search" }, "experimentalFeatures": { diff --git a/schema/site_stringdata.go b/schema/site_stringdata.go index f80065fc9e12..d5542c7e9d66 100644 --- a/schema/site_stringdata.go +++ b/schema/site_stringdata.go @@ -33,7 +33,6 @@ const SiteSchemaJSON = `{ "items": { "type": "string" }, - "!go": { "pointer": true }, "group": "Search" }, "experimentalFeatures": { From 845935cf4707260b6b3e2ffc8661232c49f2324c Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Thu, 11 Apr 2019 11:19:56 -0700 Subject: [PATCH 05/15] use non-pointer --- cmd/frontend/internal/httpapi/internal.go | 2 +- cmd/searcher/search/store.go | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/cmd/frontend/internal/httpapi/internal.go b/cmd/frontend/internal/httpapi/internal.go index e214878d1f76..8b797b0ff468 100644 --- a/cmd/frontend/internal/httpapi/internal.go +++ b/cmd/frontend/internal/httpapi/internal.go @@ -171,7 +171,7 @@ func serveConfiguration(w http.ResponseWriter, r *http.Request) error { } type searchOptions struct { - LargeFiles *[]string + LargeFiles []string } func serveSearchConfiguration(w http.ResponseWriter, r *http.Request) error { diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index 4523049de6f3..0f03b04d191d 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -362,14 +362,6 @@ func (s *Store) ignoreSizeMax(name string) bool { return false } -func getLargeFilePatterns() []string { - lfp := conf.Get().SearchLargeFiles - if lfp == nil { - return []string{} - } - return *lfp -} - // stringSlicesAreEqual checks to make sure that two string slices have the same // items. func stringSlicesAreEqual(a, b []string) bool { @@ -391,17 +383,23 @@ func stringSlicesAreEqual(a, b []string) bool { // watchLargeFilesSettingChange watches for changes to the search.largeFiles // setting and clears the disk cache when it is changed. func (s *Store) watchLargeFilesChange() { + get := func() []string { return conf.Get().SearchLargeFiles } + if s.largeFilePatterns == nil { - s.largeFilePatterns = getLargeFilePatterns() + s.largeFilePatterns = get() } conf.Watch(func() { // Ensure the slices are actually different so we don't blow away the // cache needlessly. - log15.Info("config changed", "old", s.largeFilePatterns, "new", getLargeFilePatterns()) - if lfp := getLargeFilePatterns(); !stringSlicesAreEqual(lfp, s.largeFilePatterns) { + log15.Info("config changed", "old", s.largeFilePatterns, "new", get()) + if lfp := get(); !stringSlicesAreEqual(lfp, s.largeFilePatterns) { s.largeFilePatterns = lfp - s.cache.Evict(0) + + _, err := s.cache.Evict(0) + if err != nil { + log15.Warn("error evicting cache", err) + } } }) } From bc2d8b1475999a6748fd187f272fbf3221013e90 Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Thu, 11 Apr 2019 14:40:51 -0700 Subject: [PATCH 06/15] clean up --- cmd/searcher/search/store.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index 0f03b04d191d..aebbf3b06f70 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -19,7 +19,6 @@ import ( "github.com/sourcegraph/sourcegraph/pkg/diskcache" "github.com/sourcegraph/sourcegraph/pkg/gitserver" "github.com/sourcegraph/sourcegraph/pkg/mutablelimiter" - log15 "gopkg.in/inconshreveable/log15.v2" opentracing "github.com/opentracing/opentracing-go" "github.com/opentracing/opentracing-go/ext" @@ -248,7 +247,6 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi // any file that is a candidate for being searched (under size limit and // non-binary). func copySearchable(tr *tar.Reader, zw *zip.Writer, ignoreSizeMax func(string) bool) error { - log15.Info("copy searchable") // 32*1024 is the same size used by io.Copy buf := make([]byte, 32*1024) for { @@ -352,7 +350,6 @@ func (s *Store) watchAndEvict() { // ignoreSizeMax determines whether the max size should be ignored. It uses // the glob syntax found here: https://golang.org/pkg/path/filepath/#Match. func (s *Store) ignoreSizeMax(name string) bool { - log15.Info(">>>>>", "name", name) for _, pattern := range s.largeFilePatterns { pattern = strings.TrimSpace(pattern) if m, _ := filepath.Match(pattern, name); m { @@ -392,13 +389,12 @@ func (s *Store) watchLargeFilesChange() { conf.Watch(func() { // Ensure the slices are actually different so we don't blow away the // cache needlessly. - log15.Info("config changed", "old", s.largeFilePatterns, "new", get()) if lfp := get(); !stringSlicesAreEqual(lfp, s.largeFilePatterns) { s.largeFilePatterns = lfp _, err := s.cache.Evict(0) if err != nil { - log15.Warn("error evicting cache", err) + log.Printf("failed to clearn searcher archives: %s", err) } } }) From 1c6d8c48973ee4fa0e2d692844af3790513e2384 Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Thu, 11 Apr 2019 14:50:16 -0700 Subject: [PATCH 07/15] actually call watch func --- cmd/searcher/search/store.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index aebbf3b06f70..120b8b2ba8da 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -102,6 +102,7 @@ func (s *Store) Start() { BeforeEvict: s.zipCache.delete, } go s.watchAndEvict() + go s.watchLargeFilesChange() }) } From 6b3945e786afab9d8e8cc20deacd37e01fb5bdee Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Thu, 11 Apr 2019 15:08:03 -0700 Subject: [PATCH 08/15] use mutex to prevent race conditions --- cmd/searcher/search/store.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index 120b8b2ba8da..e764619641c6 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -72,6 +72,7 @@ type Store struct { // largeFilesPatterns is a list of large file glob patterns where files that // match any pattern in the list should be searched regardless of their size. largeFilePatterns []string + lfMu sync.Mutex } // SetMaxConcurrentFetchTar sets the maximum number of concurrent calls allowed @@ -382,16 +383,21 @@ func stringSlicesAreEqual(a, b []string) bool { // setting and clears the disk cache when it is changed. func (s *Store) watchLargeFilesChange() { get := func() []string { return conf.Get().SearchLargeFiles } + set := func(f []string) { + s.lfMu.Lock() + s.largeFilePatterns = f + s.lfMu.Unlock() + } if s.largeFilePatterns == nil { - s.largeFilePatterns = get() + set(get()) } conf.Watch(func() { // Ensure the slices are actually different so we don't blow away the // cache needlessly. if lfp := get(); !stringSlicesAreEqual(lfp, s.largeFilePatterns) { - s.largeFilePatterns = lfp + set(lfp) _, err := s.cache.Evict(0) if err != nil { From 514599d90355fee04b3d986e3eb498c099de2a4b Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Fri, 12 Apr 2019 10:54:40 -0700 Subject: [PATCH 09/15] properly handle race condition --- cmd/searcher/search/store.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index e764619641c6..d18539a3d407 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -72,7 +72,7 @@ type Store struct { // largeFilesPatterns is a list of large file glob patterns where files that // match any pattern in the list should be searched regardless of their size. largeFilePatterns []string - lfMu sync.Mutex + lfMu sync.RWMutex } // SetMaxConcurrentFetchTar sets the maximum number of concurrent calls allowed @@ -352,6 +352,8 @@ func (s *Store) watchAndEvict() { // ignoreSizeMax determines whether the max size should be ignored. It uses // the glob syntax found here: https://golang.org/pkg/path/filepath/#Match. func (s *Store) ignoreSizeMax(name string) bool { + s.lfMu.RLock() + defer s.lfMu.RUnlock() for _, pattern := range s.largeFilePatterns { pattern = strings.TrimSpace(pattern) if m, _ := filepath.Match(pattern, name); m { From acdfa3f52465a87e9f5ea8a3df758016004f25cc Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Fri, 12 Apr 2019 11:26:05 -0700 Subject: [PATCH 10/15] add comment for internal search config endpoint. --- cmd/frontend/internal/httpapi/internal.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/frontend/internal/httpapi/internal.go b/cmd/frontend/internal/httpapi/internal.go index 8b797b0ff468..a3e68bb55d70 100644 --- a/cmd/frontend/internal/httpapi/internal.go +++ b/cmd/frontend/internal/httpapi/internal.go @@ -174,6 +174,11 @@ type searchOptions struct { LargeFiles []string } +// serveSearchConfiguration is _only_ used by the zoekt index server. Zoekt does +// not depend on frontend and therefore does not have access to `conf.Watch`. +// Additionally, it only cares about certain search specific settings so this +// search specific endpoint is used rather than serving the entire site settings +// from /.internal/configuration. func serveSearchConfiguration(w http.ResponseWriter, r *http.Request) error { largeFiles := conf.Get().SearchLargeFiles opts := searchOptions{ From 6a065e2734d8100002dd122323ce5a8739ea4421 Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Mon, 15 Apr 2019 18:18:00 -0700 Subject: [PATCH 11/15] encode large files settings into cache key for better cache invalidation --- cmd/frontend/internal/httpapi/internal.go | 2 +- cmd/searcher/search/store.go | 66 +++-------------------- 2 files changed, 9 insertions(+), 59 deletions(-) diff --git a/cmd/frontend/internal/httpapi/internal.go b/cmd/frontend/internal/httpapi/internal.go index a3e68bb55d70..917143bd2eee 100644 --- a/cmd/frontend/internal/httpapi/internal.go +++ b/cmd/frontend/internal/httpapi/internal.go @@ -186,7 +186,7 @@ func serveSearchConfiguration(w http.ResponseWriter, r *http.Request) error { } err := json.NewEncoder(w).Encode(opts) if err != nil { - return errors.Wrap(err, "Encode") + return errors.Wrap(err, "encode") } return nil } diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index d18539a3d407..512c2c93ef9c 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -7,6 +7,7 @@ import ( "context" "crypto/sha256" "encoding/hex" + "encoding/json" "io" "log" "path/filepath" @@ -68,11 +69,6 @@ type Store struct { // zipCache provides efficient access to repo zip files. zipCache zipCache - - // largeFilesPatterns is a list of large file glob patterns where files that - // match any pattern in the list should be searched regardless of their size. - largeFilePatterns []string - lfMu sync.RWMutex } // SetMaxConcurrentFetchTar sets the maximum number of concurrent calls allowed @@ -103,7 +99,6 @@ func (s *Store) Start() { BeforeEvict: s.zipCache.delete, } go s.watchAndEvict() - go s.watchLargeFilesChange() }) } @@ -129,8 +124,13 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api. return "", errors.Errorf("commit must be resolved (repo=%q, commit=%q)", repo.Name, commit) } + largeFilePatterns := conf.Get().SearchLargeFiles + lfpBytes, err := json.Marshal(largeFilePatterns) + if err != nil { + return "", errors.Errorf("error marshalling large file patterns: %v", err) + } // key is a sha256 hash since we want to use it for the disk name - h := sha256.Sum256([]byte(string(repo.Name) + " " + string(commit))) + h := sha256.Sum256([]byte(string(repo.Name) + " " + string(commit) + " " + string(lfpBytes))) key := hex.EncodeToString(h[:]) span.LogKV("key", key) @@ -285,8 +285,6 @@ func copySearchable(tr *tar.Reader, zw *zip.Writer, ignoreSizeMax func(string) b return err } - _ = ignoreSizeMax(hdr.Name) - // We do not search the content of large files unless they are // whitelisted. if hdr.Size > maxFileSize && !ignoreSizeMax(hdr.Name) { @@ -352,9 +350,7 @@ func (s *Store) watchAndEvict() { // ignoreSizeMax determines whether the max size should be ignored. It uses // the glob syntax found here: https://golang.org/pkg/path/filepath/#Match. func (s *Store) ignoreSizeMax(name string) bool { - s.lfMu.RLock() - defer s.lfMu.RUnlock() - for _, pattern := range s.largeFilePatterns { + for _, pattern := range conf.Get().SearchLargeFiles { pattern = strings.TrimSpace(pattern) if m, _ := filepath.Match(pattern, name); m { return true @@ -363,52 +359,6 @@ func (s *Store) ignoreSizeMax(name string) bool { return false } -// stringSlicesAreEqual checks to make sure that two string slices have the same -// items. -func stringSlicesAreEqual(a, b []string) bool { - if len(a) != len(b) { - return false - } - aMap := map[string]struct{}{} - for i := range a { - aMap[a[i]] = struct{}{} - } - for i := range b { - if _, ok := aMap[b[i]]; !ok { - return false - } - } - return true -} - -// watchLargeFilesSettingChange watches for changes to the search.largeFiles -// setting and clears the disk cache when it is changed. -func (s *Store) watchLargeFilesChange() { - get := func() []string { return conf.Get().SearchLargeFiles } - set := func(f []string) { - s.lfMu.Lock() - s.largeFilePatterns = f - s.lfMu.Unlock() - } - - if s.largeFilePatterns == nil { - set(get()) - } - - conf.Watch(func() { - // Ensure the slices are actually different so we don't blow away the - // cache needlessly. - if lfp := get(); !stringSlicesAreEqual(lfp, s.largeFilePatterns) { - set(lfp) - - _, err := s.cache.Evict(0) - if err != nil { - log.Printf("failed to clearn searcher archives: %s", err) - } - } - }) -} - var ( cacheSizeBytes = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "searcher", From 90857213e16d14e8f593fdc3d1f238e386ab7b0e Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Tue, 16 Apr 2019 15:24:59 -0700 Subject: [PATCH 12/15] address review comments --- cmd/frontend/internal/httpapi/internal.go | 8 +++--- cmd/searcher/search/store.go | 10 ++++---- cmd/searcher/search/store_test.go | 31 +++++++++++++++++++++++ 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/cmd/frontend/internal/httpapi/internal.go b/cmd/frontend/internal/httpapi/internal.go index 917143bd2eee..9116cc5e06e7 100644 --- a/cmd/frontend/internal/httpapi/internal.go +++ b/cmd/frontend/internal/httpapi/internal.go @@ -170,10 +170,6 @@ func serveConfiguration(w http.ResponseWriter, r *http.Request) error { return nil } -type searchOptions struct { - LargeFiles []string -} - // serveSearchConfiguration is _only_ used by the zoekt index server. Zoekt does // not depend on frontend and therefore does not have access to `conf.Watch`. // Additionally, it only cares about certain search specific settings so this @@ -181,7 +177,9 @@ type searchOptions struct { // from /.internal/configuration. func serveSearchConfiguration(w http.ResponseWriter, r *http.Request) error { largeFiles := conf.Get().SearchLargeFiles - opts := searchOptions{ + opts := struct { + LargeFiles []string + }{ LargeFiles: largeFiles, } err := json.NewEncoder(w).Encode(opts) diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index 512c2c93ef9c..49d8991ffd20 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -234,7 +234,7 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi defer r.Close() tr := tar.NewReader(r) zw := zip.NewWriter(pw) - err := copySearchable(tr, zw, s.ignoreSizeMax) + err := copySearchable(tr, zw) if err1 := zw.Close(); err == nil { err = err1 } @@ -248,7 +248,7 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi // copySearchable copies searchable files from tr to zw. A searchable file is // any file that is a candidate for being searched (under size limit and // non-binary). -func copySearchable(tr *tar.Reader, zw *zip.Writer, ignoreSizeMax func(string) bool) error { +func copySearchable(tr *tar.Reader, zw *zip.Writer) error { // 32*1024 is the same size used by io.Copy buf := make([]byte, 32*1024) for { @@ -287,7 +287,7 @@ func copySearchable(tr *tar.Reader, zw *zip.Writer, ignoreSizeMax func(string) b // We do not search the content of large files unless they are // whitelisted. - if hdr.Size > maxFileSize && !ignoreSizeMax(hdr.Name) { + if hdr.Size > maxFileSize && !ignoreSizeMax(hdr.Name, conf.Get().SearchLargeFiles) { continue } @@ -349,8 +349,8 @@ func (s *Store) watchAndEvict() { // ignoreSizeMax determines whether the max size should be ignored. It uses // the glob syntax found here: https://golang.org/pkg/path/filepath/#Match. -func (s *Store) ignoreSizeMax(name string) bool { - for _, pattern := range conf.Get().SearchLargeFiles { +func ignoreSizeMax(name string, patterns []string) bool { + for _, pattern := range patterns { pattern = strings.TrimSpace(pattern) if m, _ := filepath.Match(pattern, name); m { return true diff --git a/cmd/searcher/search/store_test.go b/cmd/searcher/search/store_test.go index 1587912ccb68..9f6aac22732c 100644 --- a/cmd/searcher/search/store_test.go +++ b/cmd/searcher/search/store_test.go @@ -94,6 +94,37 @@ func TestPrepareZip_fetchTarFail(t *testing.T) { } } +func TestIngoreSizeMax(t *testing.T) { + patterns := []string{ + "foo", + "foo.*", + "foo_*", + "*.foo", + "bar.baz", + } + tests := []struct { + name string + ignored bool + }{ + // Pass + {"foo", true}, + {"foo.bar", true}, + {"foo_bar", true}, + {"bar.baz", true}, + {"bar.foo", true}, + // Fail + {"baz.foo.bar", false}, + {"bar_baz", false}, + {"bar.foo", false}, + } + + for _, test := range tests { + if got, want := ignoreSizeMax(test.name, patterns), test.ignored; got != want { + t.Errorf("got %v want %v", got, want) + } + } +} + func tmpStore(t *testing.T) (*Store, func()) { d, err := ioutil.TempDir("", "search_test") if err != nil { From 87da254e6045a8ff532998852c34418e27ad1a82 Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Wed, 17 Apr 2019 17:31:06 -0700 Subject: [PATCH 13/15] fix test --- cmd/searcher/search/store_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/searcher/search/store_test.go b/cmd/searcher/search/store_test.go index 9f6aac22732c..956979effe71 100644 --- a/cmd/searcher/search/store_test.go +++ b/cmd/searcher/search/store_test.go @@ -115,12 +115,12 @@ func TestIngoreSizeMax(t *testing.T) { // Fail {"baz.foo.bar", false}, {"bar_baz", false}, - {"bar.foo", false}, + {"baz.baz", false}, } for _, test := range tests { if got, want := ignoreSizeMax(test.name, patterns), test.ignored; got != want { - t.Errorf("got %v want %v", got, want) + t.Errorf("case %s got %v want %v", test.name, got, want) } } } From 65bc9b3a2fb01128d5f284cbe815883062b7d7fc Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Thu, 18 Apr 2019 08:55:45 -0700 Subject: [PATCH 14/15] address review comments --- cmd/searcher/search/store.go | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/cmd/searcher/search/store.go b/cmd/searcher/search/store.go index 49d8991ffd20..9cfb32a9a039 100644 --- a/cmd/searcher/search/store.go +++ b/cmd/searcher/search/store.go @@ -7,7 +7,7 @@ import ( "context" "crypto/sha256" "encoding/hex" - "encoding/json" + "fmt" "io" "log" "path/filepath" @@ -125,12 +125,9 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api. } largeFilePatterns := conf.Get().SearchLargeFiles - lfpBytes, err := json.Marshal(largeFilePatterns) - if err != nil { - return "", errors.Errorf("error marshalling large file patterns: %v", err) - } + // key is a sha256 hash since we want to use it for the disk name - h := sha256.Sum256([]byte(string(repo.Name) + " " + string(commit) + " " + string(lfpBytes))) + h := sha256.Sum256([]byte(fmt.Sprintf("%q %q %q", repo.Name, commit, largeFilePatterns))) key := hex.EncodeToString(h[:]) span.LogKV("key", key) @@ -146,7 +143,7 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api. // since we're just going to close it again immediately. bgctx := opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx)) f, err := s.cache.Open(bgctx, key, func(ctx context.Context) (io.ReadCloser, error) { - return s.fetch(ctx, repo, commit) + return s.fetch(ctx, repo, commit, largeFilePatterns) }) var path string if f != nil { @@ -173,7 +170,7 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api. // fetch fetches an archive from the network and stores it on disk. It does // not populate the in-memory cache. You should probably be calling // prepareZip. -func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.CommitID) (rc io.ReadCloser, err error) { +func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.CommitID, largeFilePatterns []string) (rc io.ReadCloser, err error) { fetchQueueSize.Inc() ctx, releaseFetchLimiter, err := s.fetchLimiter.Acquire(ctx) // Acquire concurrent fetches semaphore if err != nil { @@ -234,7 +231,7 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi defer r.Close() tr := tar.NewReader(r) zw := zip.NewWriter(pw) - err := copySearchable(tr, zw) + err := copySearchable(tr, zw, largeFilePatterns) if err1 := zw.Close(); err == nil { err = err1 } @@ -248,7 +245,7 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi // copySearchable copies searchable files from tr to zw. A searchable file is // any file that is a candidate for being searched (under size limit and // non-binary). -func copySearchable(tr *tar.Reader, zw *zip.Writer) error { +func copySearchable(tr *tar.Reader, zw *zip.Writer, largeFilePatterns []string) error { // 32*1024 is the same size used by io.Copy buf := make([]byte, 32*1024) for { @@ -287,7 +284,7 @@ func copySearchable(tr *tar.Reader, zw *zip.Writer) error { // We do not search the content of large files unless they are // whitelisted. - if hdr.Size > maxFileSize && !ignoreSizeMax(hdr.Name, conf.Get().SearchLargeFiles) { + if hdr.Size > maxFileSize && !ignoreSizeMax(hdr.Name, largeFilePatterns) { continue } From 744d5d3f6d5ed912dd8da0d811151917fda95e99 Mon Sep 17 00:00:00 2001 From: Isaac Snow Date: Thu, 18 Apr 2019 09:06:57 -0700 Subject: [PATCH 15/15] add example and link to settings --- schema/site.schema.json | 5 +++-- schema/site_stringdata.go | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/schema/site.schema.json b/schema/site.schema.json index 78e0c8ba245a..fda4cb5363fa 100644 --- a/schema/site.schema.json +++ b/schema/site.schema.json @@ -23,12 +23,13 @@ "group": "Search" }, "search.largeFiles": { - "description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size.", + "description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size. The glob pattern syntax can be found here: https://golang.org/pkg/path/filepath/#Match.", "type": "array", "items": { "type": "string" }, - "group": "Search" + "group": "Search", + "examples": [["go.sum", "package-lock.json", "*.thrift"]] }, "experimentalFeatures": { "description": "Experimental features to enable or disable. Features that are now enabled by default are marked as deprecated.", diff --git a/schema/site_stringdata.go b/schema/site_stringdata.go index d5542c7e9d66..9965d7cdbf76 100644 --- a/schema/site_stringdata.go +++ b/schema/site_stringdata.go @@ -28,12 +28,13 @@ const SiteSchemaJSON = `{ "group": "Search" }, "search.largeFiles": { - "description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size.", + "description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size. The glob pattern syntax can be found here: https://golang.org/pkg/path/filepath/#Match.", "type": "array", "items": { "type": "string" }, - "group": "Search" + "group": "Search", + "examples": [["go.sum", "package-lock.json", "*.thrift"]] }, "experimentalFeatures": { "description": "Experimental features to enable or disable. Features that are now enabled by default are marked as deprecated.",