Skip to content

Migrate VFS to Data.Text.Utf16.Rope.Mixed #436

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cabal.project
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@ tests: True
benchmarks: True
test-show-details: direct
haddock-quickjump: True

source-repository-package
type: git
location: https://github.com/Bodigrim/text-rope
4 changes: 2 additions & 2 deletions lsp/lsp.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ library
, scientific
, temporary
, text
, text-rope
, text-rope >= 0.2
, transformers >= 0.5.6 && < 0.6
, time
, unordered-containers
Expand Down Expand Up @@ -131,7 +131,7 @@ test-suite unit-test
, quickcheck-instances
, sorted-list == 0.2.1.*
, text
, text-rope
, text-rope >= 0.2
, unordered-containers
-- For GHCI tests
-- , async
Expand Down
110 changes: 25 additions & 85 deletions lsp/src/Language/LSP/VFS.hs
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ import Data.Ord
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Map.Strict as Map
import Data.Maybe
import qualified Data.Text.Rope as URope
import Data.Text.Utf16.Rope ( Rope )
import qualified Data.Text.Utf16.Rope as Rope
import Data.Text.Lines as Char ( Position(..) )
import Data.Text.Utf16.Lines as Utf16 ( Position(..) )
import Data.Text.Utf16.Rope.Mixed ( Rope )
import qualified Data.Text.Utf16.Rope.Mixed as Rope
import Data.Text.Prettyprint.Doc hiding (line)
import qualified Language.LSP.Types as J
import qualified Language.LSP.Types.Lens as J
Expand Down Expand Up @@ -112,7 +113,7 @@ data VFS = VFS { _vfsMap :: !(Map.Map J.NormalizedUri VirtualFile)
} deriving Show

data VfsLog =
SplitInsideCodePoint Rope.Position Rope
SplitInsideCodePoint Utf16.Position Rope
| URINotFound J.NormalizedUri
| Opening J.NormalizedUri
| Closing J.NormalizedUri
Expand Down Expand Up @@ -342,18 +343,18 @@ applyChange :: (Monad m) => LogAction m (WithSeverity VfsLog) -> Rope -> J.TextD
applyChange _ _ (J.TextDocumentContentChangeEvent Nothing _ str)
= pure $ Rope.fromText str
applyChange logger str (J.TextDocumentContentChangeEvent (Just (J.Range (J.Position sl sc) (J.Position fl fc))) _ txt)
= changeChars logger str (Rope.Position (fromIntegral sl) (fromIntegral sc)) (Rope.Position (fromIntegral fl) (fromIntegral fc)) txt
= changeChars logger str (Utf16.Position (fromIntegral sl) (fromIntegral sc)) (Utf16.Position (fromIntegral fl) (fromIntegral fc)) txt

-- ---------------------------------------------------------------------

-- | Given a 'Rope', start and end positions, and some new text, replace
-- the given range with the new text. If the given positions lie within
-- a code point then this does nothing (returns the original 'Rope') and logs.
changeChars :: (Monad m) => LogAction m (WithSeverity VfsLog) -> Rope -> Rope.Position -> Rope.Position -> Text -> m Rope
changeChars :: (Monad m) => LogAction m (WithSeverity VfsLog) -> Rope -> Utf16.Position -> Utf16.Position -> Text -> m Rope
changeChars logger str start finish new = do
case Rope.splitAtPosition finish str of
case Rope.utf16SplitAtPosition finish str of
Nothing -> logger <& SplitInsideCodePoint finish str `WithSeverity` Warning >> pure str
Just (before, after) -> case Rope.splitAtPosition start before of
Just (before, after) -> case Rope.utf16SplitAtPosition start before of
Nothing -> logger <& SplitInsideCodePoint start before `WithSeverity` Warning >> pure str
Just (before', _) -> pure $ mconcat [before', Rope.fromText new, after]

Expand All @@ -380,103 +381,42 @@ data CodePointRange =
makeFieldsNoPrefix ''CodePointPosition
makeFieldsNoPrefix ''CodePointRange

{- Note [Converting between code points and code units]
This is inherently a somewhat expensive operation, but we take some care to minimize the cost.
In particular, we use the good asymptotics of 'Rope' to our advantage:
- We extract the single line that we are interested in in time logarithmic in the number of lines.
- We then split the line at the given position, and check how long the prefix is, which takes
linear time in the length of the (single) line.

We also may need to convert the line back and forth between ropes with different indexing. Again
this is linear time in the length of the line.

So the overall process is logarithmic in the number of lines, and linear in the length of the specific
line. Which is okay-ish, so long as we don't have very long lines.
-}

-- | Extracts a specific line from a 'Rope.Rope'.
-- Logarithmic in the number of lines.
extractLine :: Rope.Rope -> Word -> Maybe Rope.Rope
extractLine rope l = do
-- Check for the line being out of bounds
let lastLine = Rope.posLine $ Rope.lengthAsPosition rope
guard $ l <= lastLine

let (_, suffix) = Rope.splitAtLine l rope
(prefix, _) = Rope.splitAtLine 1 suffix
pure prefix

-- | Translate a code-point offset into a code-unit offset.
-- Linear in the length of the rope.
codePointOffsetToCodeUnitOffset :: URope.Rope -> Word -> Maybe Word
codePointOffsetToCodeUnitOffset rope offset = do
-- Check for the position being out of bounds
guard $ offset <= URope.length rope
-- Split at the given position in *code points*
let (prefix, _) = URope.splitAt offset rope
-- Convert the prefix to a rope using *code units*
utf16Prefix = Rope.fromText $ URope.toText prefix
-- Get the length of the prefix in *code units*
pure $ Rope.length utf16Prefix

-- | Translate a UTF-16 code-unit offset into a code-point offset.
-- Linear in the length of the rope.
codeUnitOffsetToCodePointOffset :: Rope.Rope -> Word -> Maybe Word
codeUnitOffsetToCodePointOffset rope offset = do
-- Check for the position being out of bounds
guard $ offset <= Rope.length rope
-- Split at the given position in *code units*
(prefix, _) <- Rope.splitAt offset rope
-- Convert the prefix to a rope using *code points*
let utfPrefix = URope.fromText $ Rope.toText prefix
-- Get the length of the prefix in *code points*
pure $ URope.length utfPrefix

-- | Given a virtual file, translate a 'CodePointPosition' in that file into a 'J.Position' in that file.
--
-- Will return 'Nothing' if the requested position is out of bounds of the document.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens now? I'm guessing we get the "clamped" result. I think both behaviours are reasonable but I'd like it to be documented.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's clamped.

--
-- Logarithmic in the number of lines in the document, and linear in the length of the line containing
-- the position.
codePointPositionToPosition :: VirtualFile -> CodePointPosition -> Maybe J.Position
codePointPositionToPosition vFile (CodePointPosition l cpc) = do
-- See Note [Converting between code points and code units]
let text = _file_text vFile
utf16Line <- extractLine text (fromIntegral l)
-- Convert the line a rope using *code points*
let utfLine = URope.fromText $ Rope.toText utf16Line

cuc <- codePointOffsetToCodeUnitOffset utfLine (fromIntegral cpc)
pure $ J.Position l (fromIntegral cuc)
codePointPositionToPosition :: VirtualFile -> CodePointPosition -> J.Position
codePointPositionToPosition vFile (CodePointPosition cpl cpc) =
J.Position (fromIntegral cul) (fromIntegral cuc)
where
text = _file_text vFile
(prefix, _) = Rope.charSplitAtPosition (Char.Position (fromIntegral cpl) (fromIntegral cpc)) text
Utf16.Position cul cuc = Rope.utf16LengthAsPosition prefix

-- | Given a virtual file, translate a 'CodePointRange' in that file into a 'J.Range' in that file.
--
-- Will return 'Nothing' if any of the positions are out of bounds of the document.
--
-- Logarithmic in the number of lines in the document, and linear in the length of the lines containing
-- the positions.
codePointRangeToRange :: VirtualFile -> CodePointRange -> Maybe J.Range
codePointRangeToRange :: VirtualFile -> CodePointRange -> J.Range
codePointRangeToRange vFile (CodePointRange b e) =
J.Range <$> codePointPositionToPosition vFile b <*> codePointPositionToPosition vFile e
J.Range (codePointPositionToPosition vFile b) (codePointPositionToPosition vFile e)

-- | Given a virtual file, translate a 'J.Position' in that file into a 'CodePointPosition' in that file.
--
-- Will return 'Nothing' if the requested position lies inside a code point, or if it is out of bounds of the document.
-- Will return 'Nothing' if the requested position lies inside a code point.
--
-- Logarithmic in the number of lines in the document, and linear in the length of the line containing
-- the position.
positionToCodePointPosition :: VirtualFile -> J.Position -> Maybe CodePointPosition
positionToCodePointPosition vFile (J.Position l cuc) = do
-- See Note [Converting between code points and code units]
positionToCodePointPosition vFile (J.Position cul cuc) = do
let text = _file_text vFile
utf16Line <- extractLine text (fromIntegral l)

cpc <- codeUnitOffsetToCodePointOffset utf16Line (fromIntegral cuc)
pure $ CodePointPosition l (fromIntegral cpc)
(prefix, _) <- Rope.utf16SplitAtPosition (Utf16.Position (fromIntegral cul) (fromIntegral cuc)) text
let Char.Position cpl cpc = Rope.charLengthAsPosition prefix
pure $ CodePointPosition (fromIntegral cpl) (fromIntegral cpc)

-- | Given a virtual file, translate a 'J.Range' in that file into a 'CodePointRange' in that file.
--
-- Will return 'Nothing' if any of the positions are out of bounds of the document.
-- Will return 'Nothing' if the requested position lies inside a code point.
--
-- Logarithmic in the number of lines in the document, and linear in the length of the lines containing
-- the positions.
Expand Down Expand Up @@ -512,7 +452,7 @@ getCompletionPrefix pos@(J.Position l c) (VirtualFile _ _ ropetext) =
lastMaybe xs = Just $ last xs

let curRope = fst $ Rope.splitAtLine 1 $ snd $ Rope.splitAtLine (fromIntegral l) ropetext
beforePos <- Rope.toText . fst <$> Rope.splitAt (fromIntegral c) curRope
beforePos <- Rope.toText . fst <$> Rope.utf16SplitAt (fromIntegral c) curRope
curWord <-
if | T.null beforePos -> Just ""
| T.last beforePos == ' ' -> Just "" -- don't count abc as the curword in 'abc '
Expand Down
26 changes: 13 additions & 13 deletions lsp/test/VspSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
module VspSpec where

import Data.String
import qualified Data.Text.Utf16.Rope as Rope
import qualified Data.Text.Utf16.Rope.Mixed as Rope
import Language.LSP.VFS
import qualified Language.LSP.Types as J
import qualified Data.Text as T
Expand Down Expand Up @@ -313,12 +313,12 @@ vspSpec = do
positionToCodePointPosition vfile (J.Position 1 2) `shouldBe` Nothing
positionToCodePointPosition vfile (J.Position 1 3) `shouldBe` Just (CodePointPosition 1 2)
positionToCodePointPosition vfile (J.Position 1 4) `shouldBe` Just (CodePointPosition 1 3)
positionToCodePointPosition vfile (J.Position 1 5) `shouldBe` Just (CodePointPosition 1 4)
positionToCodePointPosition vfile (J.Position 1 5) `shouldBe` Just (CodePointPosition 2 0)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one looks like an actual logic change 🤔 What happened here? Was the old implementation wrong?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tend to think that the previous implementation was not quite correct.

vfile here contains a𐐀b\na𐐀b\n. If you take 1 line (which is a𐐀b\n) and then 4 code points, you consume the last newline (as a codepoint) as well and now your position is (2, 0).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, there's an interesting question here, namely: if you have a position (x,y) where y is greater than the number of characters in the line, does that represent a position on the next line? I would say no: the first coordinate of the position limits you to that line, and the most you can do is go to the end of it. Which I would represent as (x, maxCol+1), although arguably that's a bit weird.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imagine that you pressed → key x times and then → key y times. I believe in the majority of modern editors this moves your cursor to the next line x+1. Such convention also allows to make splitAtPosition total, which is a nice property.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the more realistic example in our case where we can end up with positions out of range is the following:

  • HLS has stale file contents where line 2 only has 30 characters
  • The user has locally edited line 2 so they have more than 30 characters, and so their editor sends us positions with offsets over 30

It would be more wrong in general I believe to just start wrapping onto the next line. Keeping things strictly line-based is better here. That's why I initially was returning Nothing for positions beyond the end of the line!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see your point, but IMHO once VFS is out of sync all bets are off anyways. I think positionToCodePointPosition is too low-level to be bothered with this kind of sanity checks, it does not edit anything per se, but I can try reverting to the previous design.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the VFS being slightly out of sync is actually the normal state when the user is editing the file!

I wish I had some clear design to point you to (the LSP spec sadly provides no guidance)... I think the clamping part is fine, but I do think we should try and keep things to the same line and use the position "just past" the end of the line when we go off the end.

-- Greater column than max column
positionToCodePointPosition vfile (J.Position 1 6) `shouldBe` Nothing
positionToCodePointPosition vfile (J.Position 2 1) `shouldBe` Nothing
positionToCodePointPosition vfile (J.Position 1 6) `shouldBe` Just (CodePointPosition 2 0)
positionToCodePointPosition vfile (J.Position 2 1) `shouldBe` Just (CodePointPosition 2 0)
-- Greater line than max line
positionToCodePointPosition vfile (J.Position 3 0) `shouldBe` Nothing
positionToCodePointPosition vfile (J.Position 3 0) `shouldBe` Just (CodePointPosition 2 0)

it "converts code points to code units" $ do
let
Expand All @@ -328,16 +328,16 @@ vspSpec = do
]
vfile = VirtualFile 0 0 (fromString orig)

codePointPositionToPosition vfile (CodePointPosition 1 0) `shouldBe` Just (J.Position 1 0)
codePointPositionToPosition vfile (CodePointPosition 1 1) `shouldBe` Just (J.Position 1 1)
codePointPositionToPosition vfile (CodePointPosition 1 2) `shouldBe` Just (J.Position 1 3)
codePointPositionToPosition vfile (CodePointPosition 1 3) `shouldBe` Just (J.Position 1 4)
codePointPositionToPosition vfile (CodePointPosition 1 4) `shouldBe` Just (J.Position 1 5)
codePointPositionToPosition vfile (CodePointPosition 1 0) `shouldBe` J.Position 1 0
codePointPositionToPosition vfile (CodePointPosition 1 1) `shouldBe` J.Position 1 1
codePointPositionToPosition vfile (CodePointPosition 1 2) `shouldBe` J.Position 1 3
codePointPositionToPosition vfile (CodePointPosition 1 3) `shouldBe` J.Position 1 4
codePointPositionToPosition vfile (CodePointPosition 1 4) `shouldBe` J.Position 2 0
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

-- Greater column than max column
codePointPositionToPosition vfile (CodePointPosition 1 5) `shouldBe` Nothing
codePointPositionToPosition vfile (CodePointPosition 2 1) `shouldBe` Nothing
codePointPositionToPosition vfile (CodePointPosition 1 5) `shouldBe` J.Position 2 0
codePointPositionToPosition vfile (CodePointPosition 2 1) `shouldBe` J.Position 2 0
-- Greater line than max line
codePointPositionToPosition vfile (CodePointPosition 3 0) `shouldBe` Nothing
codePointPositionToPosition vfile (CodePointPosition 3 0) `shouldBe` J.Position 2 0

-- ---------------------------------

Expand Down
2 changes: 1 addition & 1 deletion stack.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ extra-package-dbs: []
nix:
packages: [icu]
extra-deps:
- text-rope-0.1
- text-rope-0.2
- co-log-core-0.3.1.0