-- #hide
-----------------------------------------------------------------------------
-- |
-- Module : Language.Haskell.Lexer
-- Copyright : (c) The GHC Team, 1997-2000
-- License : BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer : libraries@haskell.org
-- Stability : experimental
-- Portability : portable
--
-- Lexer for Haskell.
--
-----------------------------------------------------------------------------
-- ToDo: Introduce different tokens for decimal, octal and hexadecimal (?)
-- ToDo: FloatTok should have three parts (integer part, fraction, exponent) (?)
-- ToDo: Use a lexical analyser generator (lx?)
module Language.Haskell.Lexer (Token(..), lexer) where
import Language.Haskell.ParseMonad
import Data.Char (isAlpha, isLower, isUpper, toLower,
isDigit, isHexDigit, isOctDigit, isSpace,
ord, chr, digitToInt)
import Data.Ratio
data Token
= VarId String
| QVarId (String,String)
| ConId String
| QConId (String,String)
| VarSym String
| ConSym String
| QVarSym (String,String)
| QConSym (String,String)
| IntTok Integer
| FloatTok Rational
| Character Char
| StringTok String
-- Symbols
| LeftParen
| RightParen
| SemiColon
| LeftCurly
| RightCurly
| VRightCurly -- a virtual close brace
| LeftSquare
| RightSquare
| Comma
| Underscore
| BackQuote
-- Reserved operators
| DotDot
| Colon
| DoubleColon
| Equals
| Backslash
| Bar
| LeftArrow
| RightArrow
| At
| Tilde
| DoubleArrow
| Minus
| Exclamation
-- Reserved Ids
| KW_Case
| KW_Class
| KW_Data
| KW_Default
| KW_Deriving
| KW_Do
| KW_Else
| KW_Foreign
| KW_If
| KW_Import
| KW_In
| KW_Infix
| KW_InfixL
| KW_InfixR
| KW_Instance
| KW_Let
| KW_Module
| KW_NewType
| KW_Of
| KW_Then
| KW_Type
| KW_Where
-- Special Ids
| KW_As
| KW_Export
| KW_Hiding
| KW_Qualified
| KW_Safe
| KW_Unsafe
| EOF
deriving (Eq,Show)
reserved_ops :: [(String,Token)]
reserved_ops = [
( "..", DotDot ),
( ":", Colon ),
( "::", DoubleColon ),
( "=", Equals ),
( "\\", Backslash ),
( "|", Bar ),
( "<-", LeftArrow ),
( "->", RightArrow ),
( "@", At ),
( "~", Tilde ),
( "=>", DoubleArrow )
]
special_varops :: [(String,Token)]
special_varops = [
( "-", Minus ), --ToDo: shouldn't be here
( "!", Exclamation ) --ditto
]
reserved_ids :: [(String,Token)]
reserved_ids = [
( "_", Underscore ),
( "case", KW_Case ),
( "class", KW_Class ),
( "data", KW_Data ),
( "default", KW_Default ),
( "deriving", KW_Deriving ),
( "do", KW_Do ),
( "else", KW_Else ),
( "foreign", KW_Foreign ),
( "if", KW_If ),
( "import", KW_Import ),
( "in", KW_In ),
( "infix", KW_Infix ),
( "infixl", KW_InfixL ),
( "infixr", KW_InfixR ),
( "instance", KW_Instance ),
( "let", KW_Let ),
( "module", KW_Module ),
( "newtype", KW_NewType ),
( "of", KW_Of ),
( "then", KW_Then ),
( "type", KW_Type ),
( "where", KW_Where )
]
special_varids :: [(String,Token)]
special_varids = [
( "as", KW_As ),
( "export", KW_Export ),
( "hiding", KW_Hiding ),
( "qualified", KW_Qualified ),
( "safe", KW_Safe ),
( "unsafe", KW_Unsafe )
]
isIdent, isSymbol :: Char -> Bool
isIdent c = isAlpha c || isDigit c || c == '\'' || c == '_'
isSymbol c = elem c ":!#$%&*+./<=>?@\\^|-~"
matchChar :: Char -> String -> Lex a ()
matchChar c msg = do
s <- getInput
if null s || head s /= c then fail msg else discard 1
-- The top-level lexer.
-- We need to know whether we are at the beginning of the line to decide
-- whether to insert layout tokens.
lexer :: (Token -> P a) -> P a
lexer = runL $ do
bol <- checkBOL
bol <- lexWhiteSpace bol
startToken
if bol then lexBOL else lexToken
lexWhiteSpace :: Bool -> Lex a Bool
lexWhiteSpace bol = do
s <- getInput
case s of
'{':'-':_ -> do
discard 2
bol <- lexNestedComment bol
lexWhiteSpace bol
'-':'-':rest | all (== '-') (takeWhile isSymbol rest) -> do
lexWhile (== '-')
lexWhile (/= '\n')
s' <- getInput
case s' of
[] -> fail "Unterminated end-of-line comment"
_ -> do
lexNewline
lexWhiteSpace True
'\n':_ -> do
lexNewline
lexWhiteSpace True
'\t':_ -> do
lexTab
lexWhiteSpace bol
c:_ | isSpace c -> do
discard 1
lexWhiteSpace bol
_ -> return bol
lexNestedComment :: Bool -> Lex a Bool
lexNestedComment bol = do
s <- getInput
case s of
'-':'}':_ -> discard 2 >> return bol
'{':'-':_ -> do
discard 2
bol <- lexNestedComment bol -- rest of the subcomment
lexNestedComment bol -- rest of this comment
'\t':_ -> lexTab >> lexNestedComment bol
'\n':_ -> lexNewline >> lexNestedComment True
_:_ -> discard 1 >> lexNestedComment bol
[] -> fail "Unterminated nested comment"
-- When we are lexing the first token of a line, check whether we need to
-- insert virtual semicolons or close braces due to layout.
lexBOL :: Lex a Token
lexBOL = do
pos <- getOffside
case pos of
LT -> do
-- trace "layout: inserting '}'\n" $
-- Set col to 0, indicating that we're still at the
-- beginning of the line, in case we need a semi-colon too.
-- Also pop the context here, so that we don't insert
-- another close brace before the parser can pop it.
setBOL
popContextL "lexBOL"
return VRightCurly
EQ ->
-- trace "layout: inserting ';'\n" $
return SemiColon
GT ->
lexToken
lexToken :: Lex a Token
lexToken = do
s <- getInput
case s of
[] -> return EOF
'0':c:d:_ | toLower c == 'o' && isOctDigit d -> do
discard 2
n <- lexOctal
return (IntTok n)
| toLower c == 'x' && isHexDigit d -> do
discard 2
n <- lexHexadecimal
return (IntTok n)
c:_ | isDigit c -> lexDecimalOrFloat
| isUpper c -> lexConIdOrQual ""
| isLower c || c == '_' -> do
ident <- lexWhile isIdent
return $ case lookup ident (reserved_ids ++ special_varids) of
Just keyword -> keyword
Nothing -> VarId ident
| isSymbol c -> do
sym <- lexWhile isSymbol
return $ case lookup sym (reserved_ops ++ special_varops) of
Just t -> t
Nothing -> case c of
':' -> ConSym sym
_ -> VarSym sym
| otherwise -> do
discard 1
case c of
-- First the special symbols
'(' -> return LeftParen
')' -> return RightParen
',' -> return Comma
';' -> return SemiColon
'[' -> return LeftSquare
']' -> return RightSquare
'`' -> return BackQuote
'{' -> do
pushContextL NoLayout
return LeftCurly
'}' -> do
popContextL "lexToken"
return RightCurly
'\'' -> do
c2 <- lexChar
matchChar '\'' "Improperly terminated character constant"
return (Character c2)
'"' -> lexString
_ -> fail ("Illegal character \'" ++ show c ++ "\'\n")
lexDecimalOrFloat :: Lex a Token
lexDecimalOrFloat = do
ds <- lexWhile isDigit
rest <- getInput
case rest of
('.':d:_) | isDigit d -> do
discard 1
frac <- lexWhile isDigit
let num = parseInteger 10 (ds ++ frac)
decimals = toInteger (length frac)
exponent <- do
rest2 <- getInput
case rest2 of
'e':_ -> lexExponent
'E':_ -> lexExponent
_ -> return 0
return (FloatTok ((num%1) * 10^^(exponent - decimals)))
e:_ | toLower e == 'e' -> do
exponent <- lexExponent
return (FloatTok ((parseInteger 10 ds%1) * 10^^exponent))
_ -> return (IntTok (parseInteger 10 ds))
where
lexExponent :: Lex a Integer
lexExponent = do
discard 1 -- 'e' or 'E'
r <- getInput
case r of
'+':d:_ | isDigit d -> do
discard 1
lexDecimal
'-':d:_ | isDigit d -> do
discard 1
n <- lexDecimal
return (negate n)
d:_ | isDigit d -> lexDecimal
_ -> fail "Float with missing exponent"
lexConIdOrQual :: String -> Lex a Token
lexConIdOrQual qual = do
con <- lexWhile isIdent
let conid | null qual = ConId con
| otherwise = QConId (qual,con)
qual' | null qual = con
| otherwise = qual ++ '.':con
just_a_conid <- alternative (return conid)
rest <- getInput
case rest of
'.':c:_
| isLower c || c == '_' -> do -- qualified varid?
discard 1
ident <- lexWhile isIdent
case lookup ident reserved_ids of
-- cannot qualify a reserved word
Just _ -> just_a_conid
Nothing -> return (QVarId (qual', ident))
| isUpper c -> do -- qualified conid?
discard 1
lexConIdOrQual qual'
| isSymbol c -> do -- qualified symbol?
discard 1
sym <- lexWhile isSymbol
case lookup sym reserved_ops of
-- cannot qualify a reserved operator
Just _ -> just_a_conid
Nothing -> return $ case c of
':' -> QConSym (qual', sym)
_ -> QVarSym (qual', sym)
_ -> return conid -- not a qualified thing
lexChar :: Lex a Char
lexChar = do
r <- getInput
case r of
'\\':_ -> lexEscape
c:_ -> discard 1 >> return c
[] -> fail "Incomplete character constant"
lexString :: Lex a Token
lexString = loop ""
where
loop s = do
r <- getInput
case r of
'\\':'&':_ -> do
discard 2
loop s
'\\':c:_ | isSpace c -> do
discard 1
lexWhiteChars
matchChar '\\' "Illegal character in string gap"
loop s
| otherwise -> do
ce <- lexEscape
loop (ce:s)
'"':_ -> do
discard 1
return (StringTok (reverse s))
c:_ -> do
discard 1
loop (c:s)
[] -> fail "Improperly terminated string"
lexWhiteChars :: Lex a ()
lexWhiteChars = do
s <- getInput
case s of
'\n':_ -> do
lexNewline
lexWhiteChars
'\t':_ -> do
lexTab
lexWhiteChars
c:_ | isSpace c -> do
discard 1
lexWhiteChars
_ -> return ()
lexEscape :: Lex a Char
lexEscape = do
discard 1
r <- getInput
case r of
-- Production charesc from section B.2 (Note: \& is handled by caller)
'a':_ -> discard 1 >> return '\a'
'b':_ -> discard 1 >> return '\b'
'f':_ -> discard 1 >> return '\f'
'n':_ -> discard 1 >> return '\n'
'r':_ -> discard 1 >> return '\r'
't':_ -> discard 1 >> return '\t'
'v':_ -> discard 1 >> return '\v'
'\\':_ -> discard 1 >> return '\\'
'"':_ -> discard 1 >> return '\"'
'\'':_ -> discard 1 >> return '\''
-- Production ascii from section B.2
'^':c:_ -> discard 2 >> cntrl c
'N':'U':'L':_ -> discard 3 >> return '\NUL'
'S':'O':'H':_ -> discard 3 >> return '\SOH'
'S':'T':'X':_ -> discard 3 >> return '\STX'
'E':'T':'X':_ -> discard 3 >> return '\ETX'
'E':'O':'T':_ -> discard 3 >> return '\EOT'
'E':'N':'Q':_ -> discard 3 >> return '\ENQ'
'A':'C':'K':_ -> discard 3 >> return '\ACK'
'B':'E':'L':_ -> discard 3 >> return '\BEL'
'B':'S':_ -> discard 2 >> return '\BS'
'H':'T':_ -> discard 2 >> return '\HT'
'L':'F':_ -> discard 2 >> return '\LF'
'V':'T':_ -> discard 2 >> return '\VT'
'F':'F':_ -> discard 2 >> return '\FF'
'C':'R':_ -> discard 2 >> return '\CR'
'S':'O':_ -> discard 2 >> return '\SO'
'S':'I':_ -> discard 2 >> return '\SI'
'D':'L':'E':_ -> discard 3 >> return '\DLE'
'D':'C':'1':_ -> discard 3 >> return '\DC1'
'D':'C':'2':_ -> discard 3 >> return '\DC2'
'D':'C':'3':_ -> discard 3 >> return '\DC3'
'D':'C':'4':_ -> discard 3 >> return '\DC4'
'N':'A':'K':_ -> discard 3 >> return '\NAK'
'S':'Y':'N':_ -> discard 3 >> return '\SYN'
'E':'T':'B':_ -> discard 3 >> return '\ETB'
'C':'A':'N':_ -> discard 3 >> return '\CAN'
'E':'M':_ -> discard 2 >> return '\EM'
'S':'U':'B':_ -> discard 3 >> return '\SUB'
'E':'S':'C':_ -> discard 3 >> return '\ESC'
'F':'S':_ -> discard 2 >> return '\FS'
'G':'S':_ -> discard 2 >> return '\GS'
'R':'S':_ -> discard 2 >> return '\RS'
'U':'S':_ -> discard 2 >> return '\US'
'S':'P':_ -> discard 2 >> return '\SP'
'D':'E':'L':_ -> discard 3 >> return '\DEL'
-- Escaped numbers
'o':c:_ | isOctDigit c -> do
discard 1
n <- lexOctal
checkChar n
'x':c:_ | isHexDigit c -> do
discard 1
n <- lexHexadecimal
checkChar n
c:_ | isDigit c -> do
n <- lexDecimal
checkChar n
_ -> fail "Illegal escape sequence"
where
checkChar n | n <= 0x01FFFF = return (chr (fromInteger n))
checkChar _ = fail "Character constant out of range"
-- Production cntrl from section B.2
cntrl :: Char -> Lex a Char
cntrl c | c >= '@' && c <= '_' = return (chr (ord c - ord '@'))
cntrl _ = fail "Illegal control character"
-- assumes at least one octal digit
lexOctal :: Lex a Integer
lexOctal = do
ds <- lexWhile isOctDigit
return (parseInteger 8 ds)
-- assumes at least one hexadecimal digit
lexHexadecimal :: Lex a Integer
lexHexadecimal = do
ds <- lexWhile isHexDigit
return (parseInteger 16 ds)
-- assumes at least one decimal digit
lexDecimal :: Lex a Integer
lexDecimal = do
ds <- lexWhile isDigit
return (parseInteger 10 ds)
-- Stolen from Hugs's Prelude
parseInteger :: Integer -> String -> Integer
parseInteger radix ds =
foldl1 (\n d -> n * radix + d) (map (toInteger . digitToInt) ds)
|