Skip to content

Commit 01d305e

Browse files
committed
Add functions to convert between code-point and code-unit positions
LSP `Position`s use UTF-16 code units for offsets within lines; most other sane tools (like GHC) use Unicode code points. We need to use the right one in the right place, otherwise we get issues like haskell/haskell-language-server#2646. This is pretty unpleasant, since a code point can occupy a variable number of UTF-16 code units, so you can't do the conversion without having the file text itself. This PR provides a type for positions using code points (for clients to use to help them be less confused) and functions for using the VFS to convert between those and LSP positions.
1 parent 8b63438 commit 01d305e

File tree

2 files changed

+51
-0
lines changed

2 files changed

+51
-0
lines changed

lsp-types/src/Language/LSP/Types/Location.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import Language.LSP.Types.Utils
1111

1212
-- ---------------------------------------------------------------------
1313

14+
-- | A position in a document. Note that the character offsets in a line
15+
-- are given in UTF-16 code units, *not* Unicode code points.
1416
data Position =
1517
Position
1618
{ -- | Line position in a document (zero-based).

lsp/src/Language/LSP/VFS.hs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ module Language.LSP.VFS
4444
, persistFileVFS
4545
, closeVFS
4646

47+
-- * Positions and transformations
48+
, CodePointPosition (..)
49+
, codePointPositionToPosition
50+
, positionToCodePointPosition
51+
4752
-- * manipulating the file contents
4853
, rangeLinesFromVfs
4954
, PosPrefixInfo(..)
@@ -69,6 +74,7 @@ import Data.Ord
6974
import qualified Data.HashMap.Strict as HashMap
7075
import qualified Data.Map.Strict as Map
7176
import Data.Maybe
77+
import qualified Data.Text.Rope as URope
7278
import Data.Text.Utf16.Rope ( Rope )
7379
import qualified Data.Text.Utf16.Rope as Rope
7480
import Data.Text.Prettyprint.Doc
@@ -346,6 +352,49 @@ changeChars logger str start finish new = do
346352

347353
-- ---------------------------------------------------------------------
348354

355+
-- | A position in a document, like a 'J.Position', but where the offsets in the line are measured in Unicode code points
356+
-- instead of UTF-16 code units.
357+
data CodePointPosition =
358+
CodePointPosition
359+
{ -- | Line position in a document (zero-based).
360+
_line :: J.UInt
361+
-- | Character offset on a line in a document, counted in Unicode *code points* rather than UTF-16 code units (zero-based).
362+
, _character :: J.UInt
363+
} deriving (Show, Read, Eq, Ord)
364+
365+
-- | Given a virtual file, translate a 'CodePointPosition' in that file into the corresponding 'J.Position' (UTF-16 code unit offsets) in that file.
366+
--
367+
-- We need the file itself because translating between code points and code units depends on the actual text before the position. Note that the whole file text is transcoded to do this.
368+
codePointPositionToPosition :: VirtualFile -> CodePointPosition -> J.Position
369+
codePointPositionToPosition vFile (CodePointPosition cpl cpc) =
370+
let utf16Text = _file_text vFile
371+
-- Transcode the whole file text to a code-point based rope
372+
utfText = URope.fromText $ Rope.toText utf16Text
373+
-- Split at the given code-point position, keeping only the text before it
374+
(utfPrefix, _) = URope.splitAtPosition (URope.Position (fromIntegral cpl) (fromIntegral cpc)) utfText
375+
-- Transcode the prefix back to a code-unit based rope
376+
utf16Prefix = Rope.fromText $ URope.toText utfPrefix
377+
-- The length of the transcoded prefix, as a line and code-unit offset, is the position we want
378+
(Rope.Position cul cuc) = Rope.lengthAsPosition utf16Prefix
379+
in J.Position (fromIntegral cul) (fromIntegral cuc)
380+
381+
-- | Given a virtual file, translate a 'J.Position' (UTF-16 code unit offsets) in that file into a 'CodePointPosition' in that file.
382+
-- May fail if the requested position lies inside a code point: the result is 'Nothing' whenever 'Rope.splitAtPosition' cannot split at the requested position.
383+
--
384+
-- We need the file itself because this requires translating between code units and code points.
385+
positionToCodePointPosition :: VirtualFile -> J.Position -> Maybe CodePointPosition
386+
positionToCodePointPosition vFile (J.Position cul cuc) = do
387+
let utf16Text = _file_text vFile
388+
-- Split at the given code-unit location; a failed split aborts the whole computation with 'Nothing'
389+
(utf16Prefix, _) <- Rope.splitAtPosition (Rope.Position (fromIntegral cul) (fromIntegral cuc)) utf16Text
390+
-- Transcode the prefix to a code-point based rope
391+
let utfPrefix = URope.fromText $ Rope.toText utf16Prefix
392+
-- The length of the transcoded prefix, as a line and code-point offset, is the position we want
393+
(URope.Position cpl cpc) = URope.lengthAsPosition utfPrefix
394+
pure $ CodePointPosition (fromIntegral cpl) (fromIntegral cpc)
395+
396+
-- ---------------------------------------------------------------------
397+
349398
-- TODO:AZ:move this to somewhere sane
350399
-- | Describes the line at the current cursor position
351400
data PosPrefixInfo = PosPrefixInfo

0 commit comments

Comments
 (0)