Optimize semantic token extraction logic (#4050)

soulomoon · web-flow · commit 3c511b0b0351 · 2024-02-07T18:07:13.000Z
A follow up of #3958 , we have added a tokenizor to walk the hieAst along with the file rope, it means we no longer need to do the detour of storing temperal result as Map Range (Set identifier), instead we can optimize by fusing most of the logic into tokenizer and return [(Range, HsSemanticTokenType)] directly.
diff --git a/haskell-language-server.cabal b/haskell-language-server.cabal
@@ -1592,6 +1592,7 @@ library hls-semantic-tokens-plugin
     , syb
     , array
     , deepseq
+    , dlist
     , hls-graph == 2.6.0.0
     , template-haskell
     , data-default
diff --git a/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Internal.hs b/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Internal.hs
@@ -48,7 +48,7 @@ import           Ide.Plugin.Error                         (PluginError (PluginIn
 import           Ide.Plugin.SemanticTokens.Mappings
 import           Ide.Plugin.SemanticTokens.Query
 import           Ide.Plugin.SemanticTokens.SemanticConfig (mkSemanticConfigFunctions)
-import           Ide.Plugin.SemanticTokens.Tokenize       (hieAstSpanIdentifiers)
+import           Ide.Plugin.SemanticTokens.Tokenize       (computeRangeHsSemanticTokenTypeList)
 import           Ide.Plugin.SemanticTokens.Types
 import           Ide.Types
 import qualified Language.LSP.Protocol.Lens               as L
@@ -69,8 +69,8 @@ computeSemanticTokens :: Recorder (WithPriority SemanticLog) -> PluginId -> IdeS
 computeSemanticTokens recorder pid _ nfp = do
   config <- lift $ useSemanticConfigAction pid
   logWith recorder Debug (LogConfig config)
-  (RangeHsSemanticTokenTypes {rangeSemanticMap}, mapping) <- useWithStaleE GetSemanticTokens nfp
-  withExceptT PluginInternalError $ liftEither $ rangeSemanticMapSemanticTokens config mapping rangeSemanticMap
+  (RangeHsSemanticTokenTypes {rangeSemanticList}, mapping) <- useWithStaleE GetSemanticTokens nfp
+  withExceptT PluginInternalError $ liftEither $ rangeSemanticsSemanticTokens config mapping rangeSemanticList
 
 semanticTokensFull :: Recorder (WithPriority SemanticLog) -> PluginMethodHandler IdeState 'Method_TextDocumentSemanticTokensFull
 semanticTokensFull recorder state pid param = do
@@ -96,26 +96,8 @@ getSemanticTokensRule recorder =
     (DKMap {getTyThingMap}, _) <- lift $ useWithStale_ GetDocMap nfp
     ast <- handleMaybe (LogNoAST $ show nfp) $ getAsts hieAst M.!? (HiePath . mkFastString . fromNormalizedFilePath) nfp
     virtualFile <- handleMaybeM LogNoVF $ getVirtualFile nfp
-    -- get current location from the old ones
-    let spanIdMap = M.filter (not . null) $ hieAstSpanIdentifiers virtualFile ast
-    let names = S.unions $ M.elems spanIdMap
-    let localSemanticMap = mkLocalIdSemanticFromAst names (hieKindFunMasksKind hieKind) refMap
-    -- get imported name semantic map
-    let importedIdSemanticMap = M.mapMaybe id
-            $ M.fromSet (getTypeThing getTyThingMap) (names `S.difference` M.keysSet localSemanticMap)
-    let sMap = M.unionWith (<>) importedIdSemanticMap localSemanticMap
-    let rangeTokenType = extractSemanticTokensFromNames sMap spanIdMap
-    return $ RangeHsSemanticTokenTypes rangeTokenType
-  where
-    getTypeThing ::
-      NameEnv TyThing ->
-      Identifier ->
-      Maybe HsSemanticTokenType
-    getTypeThing tyThingMap n
-      | (Right name) <- n =
-          let tyThing = lookupNameEnv tyThingMap name
-           in (tyThing >>= tyThingSemantic)
-     | otherwise = Nothing
+    let hsFinder = idSemantic getTyThingMap (hieKindFunMasksKind hieKind) refMap
+    return $ computeRangeHsSemanticTokenTypeList hsFinder virtualFile ast
 
 -- | Persistent rule to ensure that semantic tokens doesn't block on startup
 persistentGetSemanticTokensRule :: Rules ()
diff --git a/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Mappings.hs b/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Mappings.hs
@@ -32,10 +32,6 @@ import           Language.LSP.VFS                hiding (line)
 
 -- * 0. Mapping name to Hs semantic token type.
 
-idInfixOperator :: Identifier -> Maybe HsSemanticTokenType
-idInfixOperator (Right name) = nameInfixOperator name
-idInfixOperator _            = Nothing
-
 nameInfixOperator :: Name -> Maybe HsSemanticTokenType
 nameInfixOperator name | isSymOcc (nameOccName name) = Just TOperator
 nameInfixOperator _ = Nothing
diff --git a/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Query.hs b/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Query.hs
@@ -5,10 +5,10 @@
 -- The query module is used to query the semantic tokens from the AST
 module Ide.Plugin.SemanticTokens.Query where
 
+import           Control.Applicative                  ((<|>))
 import           Data.Foldable                        (fold)
 import qualified Data.Map.Strict                      as M
 import           Data.Maybe                           (listToMaybe, mapMaybe)
-import           Data.Set                             (Set)
 import qualified Data.Set                             as Set
 import           Data.Text                            (Text)
 import           Development.IDE.Core.PositionMapping (PositionMapping,
@@ -17,8 +17,7 @@ import           Development.IDE.GHC.Compat
 import           Ide.Plugin.SemanticTokens.Mappings
 import           Ide.Plugin.SemanticTokens.Types      (HieFunMaskKind,
                                                        HsSemanticTokenType (TModule),
-                                                       IdSemanticMap,
-                                                       RangeIdSetMap,
+                                                       RangeSemanticTokenTypeList,
                                                        SemanticTokensConfig)
 import           Language.LSP.Protocol.Types          (Position (Position),
                                                        Range (Range),
@@ -30,44 +29,48 @@ import           Prelude                              hiding (length, span)
 
 ---------------------------------------------------------
 
--- * extract semantic map from HieAst for local variables
+-- * extract semantic
 
 ---------------------------------------------------------
 
-mkLocalIdSemanticFromAst :: Set Identifier -> HieFunMaskKind a -> RefMap a -> IdSemanticMap
-mkLocalIdSemanticFromAst names hieKind rm = M.mapMaybe (idIdSemanticFromHie hieKind rm) $ M.fromSet id names
+idSemantic :: forall a. NameEnv TyThing -> HieFunMaskKind a -> RefMap a -> Identifier -> Maybe HsSemanticTokenType
+idSemantic _ _ _ (Left _) = Just TModule
+idSemantic tyThingMap hieKind rm (Right n) =
+    nameSemanticFromHie hieKind rm n -- local name
+    <|> (lookupNameEnv tyThingMap n >>= tyThingSemantic) -- global name
 
-idIdSemanticFromHie :: forall a. HieFunMaskKind a -> RefMap a -> Identifier -> Maybe HsSemanticTokenType
-idIdSemanticFromHie _ _ (Left _) = Just TModule
-idIdSemanticFromHie hieKind rm ns = do
-  idSemanticFromRefMap rm ns
+
+---------------------------------------------------------
+
+-- * extract semantic from HieAst for local variables
+
+---------------------------------------------------------
+
+nameSemanticFromHie :: forall a. HieFunMaskKind a -> RefMap a -> Name -> Maybe HsSemanticTokenType
+nameSemanticFromHie hieKind rm n = do
+  idSemanticFromRefMap rm (Right n)
   where
     idSemanticFromRefMap :: RefMap a -> Identifier -> Maybe HsSemanticTokenType
     idSemanticFromRefMap rm' name' = do
       spanInfos <- M.lookup name' rm'
       let typeTokenType = foldMap (typeSemantic hieKind) $ listToMaybe $ mapMaybe (identType . snd) spanInfos
       contextInfoTokenType <- foldMap (contextInfosMaybeTokenType . identInfo . snd) spanInfos
-      fold [typeTokenType, Just contextInfoTokenType, idInfixOperator ns]
+      fold [typeTokenType, Just contextInfoTokenType, nameInfixOperator n]
 
     contextInfosMaybeTokenType :: Set.Set ContextInfo -> Maybe HsSemanticTokenType
     contextInfosMaybeTokenType details = foldMap infoTokenType (Set.toList details)
 
 
 -------------------------------------------------
 
--- * extract semantic tokens from IdSemanticMap
+-- * extract lsp semantic tokens from RangeSemanticTokenTypeList
 
 -------------------------------------------------
 
-extractSemanticTokensFromNames :: IdSemanticMap -> RangeIdSetMap -> M.Map Range HsSemanticTokenType
-extractSemanticTokensFromNames nsm = M.mapMaybe (foldMap (`M.lookup` nsm))
-
-rangeSemanticMapSemanticTokens :: SemanticTokensConfig -> PositionMapping -> M.Map Range HsSemanticTokenType -> Either Text SemanticTokens
-rangeSemanticMapSemanticTokens stc mapping =
+rangeSemanticsSemanticTokens :: SemanticTokensConfig -> PositionMapping -> RangeSemanticTokenTypeList -> Either Text SemanticTokens
+rangeSemanticsSemanticTokens stc mapping =
   makeSemanticTokens defaultSemanticTokensLegend
-    . mapMaybe (\(range, ty) -> flip toAbsSemanticToken ty <$> range)
-    . M.toAscList
-    . M.mapKeys (toCurrentRange mapping)
+    . mapMaybe (\(ran, tk) -> toAbsSemanticToken <$> toCurrentRange mapping ran <*> return tk)
   where
     toAbsSemanticToken :: Range -> HsSemanticTokenType -> SemanticTokenAbsolute
     toAbsSemanticToken (Range (Position startLine startColumn) (Position _endLine endColumn)) tokenType =
diff --git a/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Tokenize.hs b/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Tokenize.hs
@@ -1,18 +1,19 @@
 {-# LANGUAGE OverloadedRecordDot #-}
 {-# LANGUAGE OverloadedStrings   #-}
 
-module Ide.Plugin.SemanticTokens.Tokenize (hieAstSpanIdentifiers) where
+module Ide.Plugin.SemanticTokens.Tokenize (computeRangeHsSemanticTokenTypeList) where
 
 import           Control.Lens                     (Identity (runIdentity))
-import           Control.Monad                    (forM_, guard)
+import           Control.Monad                    (foldM, guard)
 import           Control.Monad.State.Strict       (MonadState (get),
                                                    MonadTrans (lift),
-                                                   execStateT, modify, put)
-import           Control.Monad.Trans.State.Strict (StateT)
+                                                   evalStateT, modify, put)
+import           Control.Monad.Trans.State.Strict (StateT, runStateT)
 import           Data.Char                        (isAlphaNum)
+import           Data.DList                       (DList)
+import qualified Data.DList                       as DL
 import qualified Data.Map.Strict                  as M
 import qualified Data.Map.Strict                  as Map
-import qualified Data.Set                         as S
 import           Data.Text                        (Text)
 import qualified Data.Text                        as T
 import qualified Data.Text.Rope                   as Char
@@ -22,95 +23,100 @@ import           Data.Text.Utf16.Rope.Mixed       (Rope)
 import qualified Data.Text.Utf16.Rope.Mixed       as Rope
 import           Development.IDE.GHC.Compat
 import           Development.IDE.GHC.Error        (realSrcSpanToCodePointRange)
-import           Ide.Plugin.SemanticTokens.Types  (RangeIdSetMap)
+import           Ide.Plugin.SemanticTokens.Types  (HsSemanticTokenType (TModule),
+                                                   RangeHsSemanticTokenTypes (..))
 import           Language.LSP.Protocol.Types      (Position (Position),
                                                    Range (Range), UInt, mkRange)
 import           Language.LSP.VFS                 hiding (line)
 import           Prelude                          hiding (length, span)
 
 type Tokenizer m a = StateT PTokenState m a
+type HsSemanticLookup = Identifier -> Maybe HsSemanticTokenType
 
 
 data PTokenState = PTokenState
-  { rangeIdSetMap  :: !RangeIdSetMap,
-    rope           :: !Rope, -- the remains of rope we are working on
-    cursor         :: !Char.Position, -- the cursor position of the current rope to the start of the original file in code point position
-    columnsInUtf16 :: !UInt -- the column of the start of the current rope in utf16
+  {
+    rope             :: !Rope -- the remains of rope we are working on
+    , cursor         :: !Char.Position -- the cursor position of the current rope to the start of the original file in code point position
+    , columnsInUtf16 :: !UInt -- the column of the start of the current rope in utf16
   }
 
-runTokenizer :: (Monad m) => Tokenizer m a -> PTokenState -> m RangeIdSetMap
-runTokenizer p st = rangeIdSetMap <$> execStateT p st
-
 data SplitResult
   = NoSplit (Text, Range) -- does not need to split, token text, token range
   | Split (Text, Range, Range) -- token text, prefix range(module range), token range
   deriving (Show)
 
+getSplitTokenText :: SplitResult -> Text
+getSplitTokenText (NoSplit (t, _))  = t
+getSplitTokenText (Split (t, _, _)) = t
+
 
 mkPTokenState :: VirtualFile -> PTokenState
 mkPTokenState vf =
   PTokenState
-    { rangeIdSetMap = mempty,
+    {
       rope = Rope.fromText $ toText vf._file_text,
       cursor = Char.Position 0 0,
       columnsInUtf16 = 0
     }
 
-addRangeIdSetMap :: (Monad m) => Range -> Identifier -> Tokenizer m ()
-addRangeIdSetMap r i = modify $ \s -> s {rangeIdSetMap = Map.insertWith (<>) r (S.singleton i) $ rangeIdSetMap s}
-
--- lift a Tokenizer Maybe () to Tokenizer m (),
--- if the Maybe is Nothing, do nothing, recover the state
--- if the Maybe is Just (), do the action, and keep the state
-liftMaybeM :: (Monad m) => Tokenizer Maybe () -> Tokenizer m ()
+-- lift a Tokenizer Maybe a to Tokenizer m a,
+-- if the Maybe is Nothing, do nothing, recover the state, and return the mempty value
+-- if the Maybe is Just x, do the action, and keep the state, and return x
+liftMaybeM :: (Monad m, Monoid a) => Tokenizer Maybe a -> Tokenizer m a
 liftMaybeM p = do
   st <- get
-  forM_ (execStateT p st) put
+  maybe (return mempty) (\(ans, st') -> put st' >> return ans) $ runStateT p st
 
-hieAstSpanIdentifiers :: VirtualFile -> HieAST a -> RangeIdSetMap
-hieAstSpanIdentifiers vf ast = runIdentity $ runTokenizer (foldAst ast) (mkPTokenState vf)
+foldMapM :: (Monad m, Monoid b, Foldable t) => (a -> m b) -> t a -> m b
+foldMapM f ta = foldM (\b a -> mappend b <$> f a) mempty ta
 
+computeRangeHsSemanticTokenTypeList :: HsSemanticLookup -> VirtualFile -> HieAST a -> RangeHsSemanticTokenTypes
+computeRangeHsSemanticTokenTypeList lookupHsTokenType vf ast =
+    RangeHsSemanticTokenTypes $ DL.toList $ runIdentity $ evalStateT (foldAst lookupHsTokenType ast) (mkPTokenState vf)
 -- | foldAst
 -- visit every leaf node in the ast in depth first order
-foldAst :: (Monad m) => HieAST t -> Tokenizer m ()
-foldAst ast = if null (nodeChildren ast)
-  then liftMaybeM (visitLeafIds ast)
-  else mapM_ foldAst $ nodeChildren ast
+foldAst :: (Monad m) => HsSemanticLookup -> HieAST t -> Tokenizer m (DList (Range, HsSemanticTokenType))
+foldAst lookupHsTokenType ast = if null (nodeChildren ast)
+  then liftMaybeM (visitLeafIds lookupHsTokenType ast)
+  else foldMapM (foldAst lookupHsTokenType) $ nodeChildren ast
 
-visitLeafIds :: HieAST t -> Tokenizer Maybe ()
-visitLeafIds leaf = liftMaybeM $ do
+visitLeafIds :: HsSemanticLookup -> HieAST t -> Tokenizer Maybe (DList (Range, HsSemanticTokenType))
+visitLeafIds lookupHsTokenType leaf = liftMaybeM $ do
   let span = nodeSpan leaf
   (ran, token) <- focusTokenAt leaf
   -- if `focusTokenAt` succeed, we can safely assume we have shift the cursor correctly
   -- we do not need to recover the cursor state, even if the following computation failed
   liftMaybeM $ do
     -- only handle the leaf node with single column token
     guard $ srcSpanStartLine span == srcSpanEndLine span
-    splitResult <-  lift $ splitRangeByText token ran
-    mapM_ (combineNodeIds ran splitResult) $ Map.filterWithKey (\k _ -> k == SourceInfo) $ getSourcedNodeInfo $ sourcedNodeInfo leaf
+    splitResult <- lift $ splitRangeByText token ran
+    foldMapM (combineNodeIds lookupHsTokenType ran splitResult) $ Map.filterWithKey (\k _ -> k == SourceInfo) $ getSourcedNodeInfo $ sourcedNodeInfo leaf
   where
-    combineNodeIds :: (Monad m) => Range -> SplitResult -> NodeInfo a -> Tokenizer m ()
-    combineNodeIds ran ranSplit (NodeInfo _ _ bd) = mapM_ (getIdentifier ran ranSplit) (M.keys bd)
-    getIdentifier :: (Monad m) => Range -> SplitResult -> Identifier -> Tokenizer m ()
-    getIdentifier ran ranSplit idt = liftMaybeM $ do
+    combineNodeIds :: (Monad m) => HsSemanticLookup -> Range -> SplitResult -> NodeInfo a -> Tokenizer m (DList (Range, HsSemanticTokenType))
+    combineNodeIds lookupHsTokenType ran ranSplit (NodeInfo _ _ bd) =
+        case (maybeTokenType, ranSplit) of
+            (Nothing, _) -> return mempty
+            (Just TModule, _) -> return $ DL.singleton (ran, TModule)
+            (Just tokenType, NoSplit (_, tokenRan)) -> return $ DL.singleton (tokenRan, tokenType)
+            (Just tokenType, Split (_, ranPrefix, tokenRan)) -> return $ DL.fromList [(ranPrefix, TModule),(tokenRan, tokenType)]
+        where maybeTokenType = foldMap (getIdentifier lookupHsTokenType ranSplit) (M.keys bd)
+
+    getIdentifier :: HsSemanticLookup -> SplitResult -> Identifier -> Maybe HsSemanticTokenType
+    getIdentifier lookupHsTokenType ranSplit idt = do
       case idt of
-        Left _moduleName -> addRangeIdSetMap ran idt
+        Left _moduleName -> Just TModule
         Right name -> do
-          occStr <- lift $ T.pack <$> case (occNameString . nameOccName) name of
+          occStr <- T.pack <$> case (occNameString . nameOccName) name of
             -- the generated selector name with {-# LANGUAGE DuplicateRecordFields #-}
             '$' : 's' : 'e' : 'l' : ':' : xs -> Just $ takeWhile (/= ':') xs
             -- other generated names that should not be visible
             '$' : c : _ | isAlphaNum c       -> Nothing
             c : ':' : _ | isAlphaNum c       -> Nothing
             ns                               -> Just ns
-          case ranSplit of
-            (NoSplit (tk, r)) -> do
-              guard $ tk == occStr
-              addRangeIdSetMap r idt
-            (Split (tk, r1, r2)) -> do
-              guard $ tk == occStr
-              addRangeIdSetMap r1 (Left $ mkModuleName "")
-              addRangeIdSetMap r2 idt
+          guard $ getSplitTokenText ranSplit == occStr
+          lookupHsTokenType idt
+
 
 focusTokenAt ::
   -- | leaf node we want to focus on
diff --git a/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Types.hs b/plugins/hls-semantic-tokens-plugin/src/Ide/Plugin/SemanticTokens/Types.hs
@@ -11,7 +11,6 @@ import           Control.DeepSeq               (NFData (rnf), rwhnf)
 import qualified Data.Array                    as A
 import           Data.Default                  (Default (def))
 import           Data.Generics                 (Typeable)
-import qualified Data.Map.Strict               as M
 import           Development.IDE               (Pretty (pretty), RuleResult)
 import qualified Development.IDE.Core.Shake    as Shake
 import           Development.IDE.GHC.Compat    hiding (loc)
@@ -108,25 +107,28 @@ data Loc = Loc
 instance Show Loc where
   show (Loc line startChar len) = show line <> ":" <> show startChar <> "-" <> show (startChar + len)
 
-type RangeIdSetMap = Map Range (Set Identifier)
-
-type IdSemanticMap = Map Identifier HsSemanticTokenType
-
 data GetSemanticTokens = GetSemanticTokens
   deriving (Eq, Show, Typeable, Generic)
 
 instance Hashable GetSemanticTokens
 
 instance NFData GetSemanticTokens
 
-newtype RangeHsSemanticTokenTypes = RangeHsSemanticTokenTypes {rangeSemanticMap :: M.Map Range HsSemanticTokenType}
+type RangeSemanticTokenTypeList = [(Range, HsSemanticTokenType)]
+
+newtype RangeHsSemanticTokenTypes = RangeHsSemanticTokenTypes {rangeSemanticList :: RangeSemanticTokenTypeList}
 
 instance NFData RangeHsSemanticTokenTypes where
   rnf :: RangeHsSemanticTokenTypes -> ()
   rnf (RangeHsSemanticTokenTypes a) = rwhnf a
 
 instance Show RangeHsSemanticTokenTypes where
-  show = const "RangeHsSemanticTokenTypes"
+  show (RangeHsSemanticTokenTypes xs) = unlines $ map showRangeToken xs
+
+showRangeToken :: (Range, HsSemanticTokenType) -> String
+showRangeToken (ran, tk) = showRange ran <> " " <> show tk
+showRange :: Range -> String
+showRange (Range (Position l1 c1) (Position l2 c2)) = show l1 <> ":" <> show c1 <> "-" <> show l2 <> ":" <> show c2
 
 type instance RuleResult GetSemanticTokens = RangeHsSemanticTokenTypes