Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

1624 index large files #3337

Merged
merged 15 commits into from
Apr 18, 2019
1 change: 1 addition & 0 deletions cmd/frontend/internal/httpapi/httpapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ func NewInternalHandler(m *mux.Router) http.Handler {
m.Get(apirouter.Telemetry).Handler(trace.TraceRoute(telemetryHandler))
m.Get(apirouter.GraphQL).Handler(trace.TraceRoute(handler(serveGraphQL)))
m.Get(apirouter.Configuration).Handler(trace.TraceRoute(handler(serveConfiguration)))
m.Get(apirouter.SearchConfiguration).Handler(trace.TraceRoute(handler(serveSearchConfiguration)))
m.Path("/ping").Methods("GET").Name("ping").HandlerFunc(handlePing)

m.NotFoundHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down
19 changes: 19 additions & 0 deletions cmd/frontend/internal/httpapi/internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,25 @@ func serveConfiguration(w http.ResponseWriter, r *http.Request) error {
return nil
}

// serveSearchConfiguration is _only_ used by the zoekt index server. Zoekt does
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Go doc comments aren't supposed to include markdownish things like foo or bar.
See golang/go#16666 for some discussion about this.

Copy link
Contributor Author

@ijsnow ijsnow Apr 16, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm optimizing for reading in sourcegraph. Sourcegraph tooltips render as markdown. See this comment in sourcegraph here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's probably a bug.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it's a sourcegraph feature.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, lots of people use markdown in godoc even though it isn't supported. Luckily, godoc has so few features that it doesn't ever conflict.

// not depend on frontend and therefore does not have access to `conf.Watch`.
// Additionally, it only cares about certain search-specific settings, so this
// search-specific endpoint is used rather than serving the entire site settings
// from /.internal/configuration.
func serveSearchConfiguration(w http.ResponseWriter, r *http.Request) error {
	// searchConfig is the response payload. The field carries no JSON tag, so
	// it is serialized under the key "LargeFiles".
	type searchConfig struct {
		LargeFiles []string
	}

	// Read the current large-file patterns from the site configuration and
	// stream them straight to the response writer.
	payload := searchConfig{LargeFiles: conf.Get().SearchLargeFiles}
	if err := json.NewEncoder(w).Encode(payload); err != nil {
		return errors.Wrap(err, "encode")
	}
	return nil
}

func serveReposList(w http.ResponseWriter, r *http.Request) error {
var opt db.ReposListOptions
err := json.NewDecoder(r.Body).Decode(&opt)
Expand Down
2 changes: 2 additions & 0 deletions cmd/frontend/internal/httpapi/router/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ const (
ReposListEnabled = "internal.repos.list-enabled"
ReposUpdateMetadata = "internal.repos.update-metadata"
Configuration = "internal.configuration"
SearchConfiguration = "internal.search-configuration"
ExternalServiceConfigs = "internal.external-services.configs"
ExternalServicesList = "internal.external-services.list"
)
Expand Down Expand Up @@ -104,6 +105,7 @@ func NewInternal(base *mux.Router) *mux.Router {
base.Path("/repos/update-metadata").Methods("POST").Name(ReposUpdateMetadata)
base.Path("/repos/{RepoName:.*}").Methods("POST").Name(ReposGetByName)
base.Path("/configuration").Methods("POST").Name(Configuration)
base.Path("/search/configuration").Methods("GET").Name(SearchConfiguration)
addRegistryRoute(base)
addGraphQLRoute(base)
addTelemetryRoute(base)
Expand Down
33 changes: 26 additions & 7 deletions cmd/searcher/search/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,16 @@ import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"log"
"path/filepath"
"strings"
"sync"
"time"

"github.com/sourcegraph/sourcegraph/pkg/api"
"github.com/sourcegraph/sourcegraph/pkg/conf"
"github.com/sourcegraph/sourcegraph/pkg/diskcache"
"github.com/sourcegraph/sourcegraph/pkg/gitserver"
"github.com/sourcegraph/sourcegraph/pkg/mutablelimiter"
Expand Down Expand Up @@ -120,8 +124,10 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api.
return "", errors.Errorf("commit must be resolved (repo=%q, commit=%q)", repo.Name, commit)
}

largeFilePatterns := conf.Get().SearchLargeFiles

// key is a sha256 hash since we want to use it for the disk name
h := sha256.Sum256([]byte(string(repo.Name) + " " + string(commit)))
h := sha256.Sum256([]byte(fmt.Sprintf("%q %q %q", repo.Name, commit, largeFilePatterns)))
key := hex.EncodeToString(h[:])
span.LogKV("key", key)

Expand All @@ -137,7 +143,7 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api.
// since we're just going to close it again immediately.
bgctx := opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
f, err := s.cache.Open(bgctx, key, func(ctx context.Context) (io.ReadCloser, error) {
return s.fetch(ctx, repo, commit)
return s.fetch(ctx, repo, commit, largeFilePatterns)
})
var path string
if f != nil {
Expand All @@ -164,7 +170,7 @@ func (s *Store) prepareZip(ctx context.Context, repo gitserver.Repo, commit api.
// fetch fetches an archive from the network and stores it on disk. It does
// not populate the in-memory cache. You should probably be calling
// prepareZip.
func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.CommitID) (rc io.ReadCloser, err error) {
func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.CommitID, largeFilePatterns []string) (rc io.ReadCloser, err error) {
fetchQueueSize.Inc()
ctx, releaseFetchLimiter, err := s.fetchLimiter.Acquire(ctx) // Acquire concurrent fetches semaphore
if err != nil {
Expand Down Expand Up @@ -225,7 +231,7 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi
defer r.Close()
tr := tar.NewReader(r)
zw := zip.NewWriter(pw)
err := copySearchable(tr, zw)
err := copySearchable(tr, zw, largeFilePatterns)
if err1 := zw.Close(); err == nil {
err = err1
}
Expand All @@ -239,7 +245,7 @@ func (s *Store) fetch(ctx context.Context, repo gitserver.Repo, commit api.Commi
// copySearchable copies searchable files from tr to zw. A searchable file is
// any file that is a candidate for being searched (under size limit and
// non-binary).
func copySearchable(tr *tar.Reader, zw *zip.Writer) error {
func copySearchable(tr *tar.Reader, zw *zip.Writer, largeFilePatterns []string) error {
// 32*1024 is the same size used by io.Copy
buf := make([]byte, 32*1024)
for {
Expand Down Expand Up @@ -276,8 +282,9 @@ func copySearchable(tr *tar.Reader, zw *zip.Writer) error {
return err
}

// We do not search the content of large files
if hdr.Size > maxFileSize {
// We do not search the content of large files unless they are
// whitelisted.
if hdr.Size > maxFileSize && !ignoreSizeMax(hdr.Name, largeFilePatterns) {
continue
}

Expand Down Expand Up @@ -337,6 +344,18 @@ func (s *Store) watchAndEvict() {
}
}

// ignoreSizeMax reports whether the file called name is exempt from the
// maximum file size limit, i.e. whether name matches at least one of the
// given glob patterns. Surrounding whitespace is stripped from each pattern
// before matching. Patterns follow the filepath.Match glob syntax:
// https://golang.org/pkg/path/filepath/#Match.
//
// It returns false when patterns is empty or nothing matches.
func ignoreSizeMax(name string, patterns []string) bool {
	for _, raw := range patterns {
		// The error from filepath.Match is intentionally discarded: a
		// pattern that fails to match for any reason, including being
		// malformed, simply does not exempt the file.
		if ok, _ := filepath.Match(strings.TrimSpace(raw), name); ok {
			return true
		}
	}
	return false
}

var (
cacheSizeBytes = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "searcher",
Expand Down
31 changes: 31 additions & 0 deletions cmd/searcher/search/store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,37 @@ func TestPrepareZip_fetchTarFail(t *testing.T) {
}
}

// TestIngoreSizeMax checks ignoreSizeMax against a fixed set of glob patterns,
// covering both names that should be exempt from the size limit and names that
// should not.
func TestIngoreSizeMax(t *testing.T) {
	globs := []string{"foo", "foo.*", "foo_*", "*.foo", "bar.baz"}

	type testCase struct {
		name    string
		ignored bool
	}
	cases := []testCase{
		// Names matched by at least one glob.
		{name: "foo", ignored: true},
		{name: "foo.bar", ignored: true},
		{name: "foo_bar", ignored: true},
		{name: "bar.baz", ignored: true},
		{name: "bar.foo", ignored: true},
		// Names matched by none of the globs.
		{name: "baz.foo.bar", ignored: false},
		{name: "bar_baz", ignored: false},
		{name: "baz.baz", ignored: false},
	}

	for i := range cases {
		tc := cases[i]
		got := ignoreSizeMax(tc.name, globs)
		if got == tc.ignored {
			continue
		}
		t.Errorf("case %s got %v want %v", tc.name, got, tc.ignored)
	}
}

func tmpStore(t *testing.T) (*Store, func()) {
d, err := ioutil.TempDir("", "search_test")
if err != nil {
Expand Down
1 change: 1 addition & 0 deletions schema/schema.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions schema/site.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
"!go": { "pointer": true },
"group": "Search"
},
"search.largeFiles": {
"description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size. The glob pattern syntax can be found here: https://golang.org/pkg/path/filepath/#Match.",
"type": "array",
"items": {
"type": "string"
},
"group": "Search",
"examples": [["go.sum", "package-lock.json", "*.thrift"]]
},
"experimentalFeatures": {
"description": "Experimental features to enable or disable. Features that are now enabled by default are marked as deprecated.",
"type": "object",
Expand Down
9 changes: 9 additions & 0 deletions schema/site_stringdata.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.