-----------------------------------------------------------------------------
-- |
-- Module : Tokenise
-- Copyright : 2004 Malcolm Wallace
-- Licence : LGPL
--
-- Maintainer : Malcolm Wallace <Malcolm.Wallace@cs.york.ac.uk>
-- Stability : experimental
-- Portability : All
--
-- The purpose of this module is to lex a source file (language
-- unspecified) into tokens such that cpp can recognise a replaceable
-- symbol or macro-use, and do the right thing.
-----------------------------------------------------------------------------
module Language.Preprocessor.Cpphs.Tokenise
( linesCpp
, reslash
, tokenise
, WordStyle(..)
, deWordStyle
, parseMacroCall
) where
import Char
import Language.Preprocessor.Cpphs.HashDefine
import Language.Preprocessor.Cpphs.Position
-- | A Mode value describes whether to tokenise a la Haskell, or a la Cpp.
-- The main difference is that in Cpp mode we should recognise line
-- continuation characters.
data Mode = Haskell | Cpp
-- | linesCpp is, broadly speaking, Prelude.lines, except that
-- on a line beginning with a \#, line continuation characters are
-- recognised. In a line continuation, the newline character is
-- preserved, but the backslash is not.
linesCpp :: String -> [String]
linesCpp [] = []
linesCpp (x:xs) | x=='#' = tok Cpp ['#'] xs
| otherwise = tok Haskell [] (x:xs)
where
tok Cpp acc ('\\':'\n':ys) = tok Cpp ('\n':acc) ys
tok _ acc ('\n':'#':ys) = reverse acc: tok Cpp ['#'] ys
tok _ acc ('\n':ys) = reverse acc: tok Haskell [] ys
tok _ acc [] = reverse acc: []
tok mode acc (y:ys) = tok mode (y:acc) ys
-- | Put back the line-continuation characters.
reslash :: String -> String
reslash ('\n':xs) = '\\':'\n':reslash xs
reslash (x:xs) = x: reslash xs
reslash [] = []
----
-- | Submodes are required to deal correctly with nesting of lexical
-- structures.
data SubMode = Any | Pred (Char->Bool) (Posn->String->WordStyle)
| String Char | LineComment | NestComment Int
| CComment | CLineComment
-- | Each token is classified as one of Ident, Other, or Cmd:
-- * Ident is a word that could potentially match a macro name.
-- * Cmd is a complete cpp directive (\#define etc).
-- * Other is anything else.
data WordStyle = Ident Posn String | Other String | Cmd (Maybe HashDefine)
deriving (Eq,Show)
other :: Posn -> String -> WordStyle
other _ s = Other s
deWordStyle :: WordStyle -> String
deWordStyle (Ident _ i) = i
deWordStyle (Other i) = i
deWordStyle (Cmd _) = "\n"
-- | tokenise is, broadly-speaking, Prelude.words, except that:
-- * the input is already divided into lines
-- * each word-like "token" is categorised as one of {Ident,Other,Cmd}
-- * \#define's are parsed and returned out-of-band using the Cmd variant
-- * All whitespace is preserved intact as tokens.
-- * C-comments are converted to white-space (depending on first param)
-- * Parens and commas are tokens in their own right.
-- * Any cpp line continuations are respected.
-- No errors can be raised.
-- The inverse of tokenise is (concatMap deWordStyle).
tokenise :: Bool -> Bool -> Bool -> Bool -> [(Posn,String)] -> [WordStyle]
tokenise _ _ _ _ [] = []
tokenise stripEol stripComments ansi lang ((pos,str):pos_strs) =
(if lang then haskell else plaintext) Any [] pos pos_strs str
where
-- rules to lex Haskell
haskell :: SubMode -> String -> Posn -> [(Posn,String)]
-> String -> [WordStyle]
haskell Any acc p ls ('\n':'#':xs) = emit acc $ -- emit "\n" $
cpp Any haskell [] [] p ls xs
-- warning: non-maximal munch on comment
haskell Any acc p ls ('-':'-':xs) = emit acc $
haskell LineComment "--" p ls xs
haskell Any acc p ls ('{':'-':xs) = emit acc $
haskell (NestComment 0) "-{" p ls xs
haskell Any acc p ls ('/':'*':xs)
| stripComments = emit acc $
haskell CComment " " p ls xs
haskell Any acc p ls ('/':'/':xs)
| stripEol = emit acc $
haskell CLineComment " " p ls xs
haskell Any acc p ls ('"':xs) = emit acc $
haskell (String '"') ['"'] p ls xs
haskell Any acc p ls ('\'':xs) = emit acc $
haskell (String '\'') "'" p ls xs
haskell Any acc p ls (x:xs) | single x = emit acc $ emit [x] $
haskell Any [] p ls xs
haskell Any acc p ls (x:xs) | space x = emit acc $
haskell (Pred space other) [x]
p ls xs
haskell Any acc p ls (x:xs) | symbol x = emit acc $
haskell (Pred symbol other) [x]
p ls xs
-- haskell Any [] p ls (x:xs) | ident0 x = id $
haskell Any acc p ls (x:xs) | ident0 x = emit acc $
haskell (Pred ident1 Ident) [x]
p ls xs
haskell Any acc p ls (x:xs) = haskell Any (x:acc) p ls xs
haskell pre@(Pred pred ws) acc p ls (x:xs)
| pred x = haskell pre (x:acc) p ls xs
haskell (Pred _ ws) acc p ls xs = ws p (reverse acc):
haskell Any [] p ls xs
haskell (String c) acc p ls ('\\':x:xs)
| x=='\\' = haskell (String c) ('\\':'\\':acc) p ls xs
| x==c = haskell (String c) (c:'\\':acc) p ls xs
haskell (String c) acc p ls (x:xs)
| x==c = emit (c:acc) $ haskell Any [] p ls xs
| otherwise = haskell (String c) (x:acc) p ls xs
haskell LineComment acc p ls xs@('\n':_) = emit acc $ haskell Any [] p ls xs
haskell LineComment acc p ls (x:xs) = haskell LineComment (x:acc) p ls xs
haskell (NestComment n) acc p ls ('{':'-':xs)
= haskell (NestComment (n+1))
("-{"++acc) p ls xs
haskell (NestComment 0) acc p ls ('-':'}':xs)
= emit ("}-"++acc) $ haskell Any [] p ls xs
haskell (NestComment n) acc p ls ('-':'}':xs)
= haskell (NestComment (n-1))
("}-"++acc) p ls xs
haskell (NestComment n) acc p ls (x:xs) = haskell (NestComment n) (x:acc)
p ls xs
haskell CComment acc p ls ('*':'/':xs) = emit (" "++acc) $
haskell Any [] p ls xs
haskell CComment acc p ls (_:xs) = haskell CComment (' ':acc) p ls xs
haskell CLineComment acc p ls xs@('\n':_)= emit acc $ haskell Any [] p ls xs
haskell CLineComment acc p ls (_:xs) = haskell CLineComment (' ':acc)
p ls xs
haskell mode acc _ ((p,l):ls) [] = haskell mode acc p ls ('\n':l)
haskell _ acc _ [] [] = emit acc $ []
-- rules to lex Cpp
cpp :: SubMode -> (SubMode -> String -> Posn -> [(Posn,String)]
-> String -> [WordStyle])
-> String -> [String] -> Posn -> [(Posn,String)]
-> String -> [WordStyle]
cpp mode next word line pos remaining input =
lexcpp mode word line remaining input
where
lexcpp Any w l ls ('/':'*':xs) = lexcpp (NestComment 0) "" (w*/*l) ls xs
lexcpp Any w l ls ('/':'/':xs) = lexcpp LineComment " " (w*/*l) ls xs
lexcpp Any w l ((p,l'):ls) ('\\':[]) = cpp Any next [] ("\n":w*/*l) p ls l'
lexcpp Any w l ls ('\\':'\n':xs) = lexcpp Any [] ("\n":w*/*l) ls xs
lexcpp Any w l ls xs@('\n':_) = Cmd (parseHashDefine ansi
(reverse (w*/*l))):
next Any [] pos ls xs
-- lexcpp Any w l ls ('"':xs) = lexcpp (String '"') ['"'] (w*/*l) ls xs
-- lexcpp Any w l ls ('\'':xs) = lexcpp (String '\'') "'" (w*/*l) ls xs
lexcpp Any w l ls ('"':xs) = lexcpp Any [] ("\"":(w*/*l)) ls xs
lexcpp Any w l ls ('\'':xs) = lexcpp Any [] ("'": (w*/*l)) ls xs
lexcpp Any [] l ls (x:xs)
| ident0 x = lexcpp (Pred ident1 Ident) [x] l ls xs
-- lexcpp Any w l ls (x:xs) | ident0 x = lexcpp (Pred ident1 Ident) [x] (w*/*l) ls xs
lexcpp Any w l ls (x:xs)
| single x = lexcpp Any [] ([x]:w*/*l) ls xs
| space x = lexcpp (Pred space other) [x] (w*/*l) ls xs
| symbol x = lexcpp (Pred symbol other) [x] (w*/*l) ls xs
| otherwise = lexcpp Any (x:w) l ls xs
lexcpp pre@(Pred pred _) w l ls (x:xs)
| pred x = lexcpp pre (x:w) l ls xs
lexcpp (Pred _ _) w l ls xs = lexcpp Any [] (w*/*l) ls xs
lexcpp (String c) w l ls ('\\':x:xs)
| x=='\\' = lexcpp (String c) ('\\':'\\':w) l ls xs
| x==c = lexcpp (String c) (c:'\\':w) l ls xs
lexcpp (String c) w l ls (x:xs)
| x==c = lexcpp Any [] ((c:w)*/*l) ls xs
| otherwise = lexcpp (String c) (x:w) l ls xs
lexcpp LineComment w l ((p,l'):ls) ('\\':[])
= cpp LineComment next [] (('\n':w)*/*l) pos ls l'
lexcpp LineComment w l ls ('\\':'\n':xs)
= lexcpp LineComment [] (('\n':w)*/*l) ls xs
lexcpp LineComment w l ls xs@('\n':_) = lexcpp Any w l ls xs
lexcpp LineComment w l ls (_:xs) = lexcpp LineComment (' ':w) l ls xs
lexcpp (NestComment _) w l ls ('*':'/':xs)
= lexcpp Any [] (w*/*l) ls xs
lexcpp (NestComment n) w l ls (_:xs) = lexcpp (NestComment n) (' ':w) l
ls xs
lexcpp mode w l ((p,l'):ls) [] = cpp mode next w l p ls ('\n':l')
lexcpp _ _ _ [] [] = []
-- rules to lex non-Haskell, non-cpp text
plaintext :: SubMode -> String -> Posn -> [(Posn,String)]
-> String -> [WordStyle]
plaintext Any acc p ls ('\n':'#':xs) = emit acc $ -- emit "\n" $
cpp Any plaintext [] [] p ls xs
plaintext Any acc p ls ('/':'*':xs)
| stripComments = emit acc $
plaintext CComment " " p ls xs
plaintext Any acc p ls ('/':'/':xs)
| stripEol = emit acc $
plaintext CLineComment " " p ls xs
plaintext Any acc p ls (x:xs) | single x = emit acc $ emit [x] $
plaintext Any [] p ls xs
plaintext Any acc p ls (x:xs) | space x = emit acc $
plaintext (Pred space other) [x]
p ls xs
plaintext Any acc p ls (x:xs) | ident0 x = emit acc $
plaintext (Pred ident1 Ident) [x]
p ls xs
plaintext Any acc p ls (x:xs) = plaintext Any (x:acc) p ls xs
plaintext pre@(Pred pred ws) acc p ls (x:xs)
| pred x = plaintext pre (x:acc) p ls xs
plaintext (Pred _ ws) acc p ls xs = ws p (reverse acc):
plaintext Any [] p ls xs
plaintext CComment acc p ls ('*':'/':xs) = emit (" "++acc) $
plaintext Any [] p ls xs
plaintext CComment acc p ls (_:xs) = plaintext CComment (' ':acc) p ls xs
plaintext CLineComment acc p ls xs@('\n':_)
= emit acc $ plaintext Any [] p ls xs
plaintext CLineComment acc p ls (_:xs)= plaintext CLineComment (' ':acc)
p ls xs
plaintext mode acc _ ((p,l):ls) [] = plaintext mode acc p ls ('\n':l)
plaintext _ acc _ [] [] = emit acc $ []
-- predicates for lexing Haskell.
ident0 x = isAlpha x || x `elem` "_`"
ident1 x = isAlphaNum x || x `elem` "'_`"
symbol x = x `elem` ":!#$%&*+./<=>?@\\^|-~"
single x = x `elem` "(),[];{}"
space x = x `elem` " \t"
-- emit a token (if there is one) from the accumulator
emit "" = id
emit xs = (Other (reverse xs):)
-- add a reversed word to the accumulator
"" */* l = l
w */* l = reverse w : l
-- help out broken Haskell compilers which need balanced numbers of C
-- comments in order to do import chasing :-) -----> */*
-- | Parse a possible macro call, returning argument list and remaining input
parseMacroCall :: Posn -> [WordStyle] -> Maybe ([[WordStyle]],[WordStyle])
parseMacroCall p = call . skip
where
skip (Other x:xs) | all isSpace x = skip xs
skip xss = xss
call (Other "(":xs) = (args (0::Int) [] [] . skip) xs
call _ = Nothing
args 0 w acc ( Other ")" :xs) = Just (reverse (addone w acc), xs)
args 0 w acc ( Other "," :xs) = args 0 [] (addone w acc) (skip xs)
args n w acc (x@(Other "("):xs) = args (n+1) (x:w) acc xs
args n w acc (x@(Other ")"):xs) = args (n-1) (x:w) acc xs
args n w acc ( Ident _ v :xs) = args n (Ident p v:w) acc xs
args n w acc (x@(Other _) :xs) = args n (x:w) acc xs
args _ _ _ _ = Nothing
addone w acc = reverse (skip w): acc
|