summaryrefslogtreecommitdiff
path: root/Pancake/Reading.hs
blob: 3a83e4614aaa6cd46174497123aa928392855c09 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
{-
Copyright (C) 2017-2018 defanor <defanor@uberspace.net>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
-}

{- |
Module      :  Pancake.Reading
Maintainer  :  defanor <defanor@uberspace.net>
Stability   :  unstable
Portability :  non-portable (GHC extensions are used)

Document retrieval and parsing.
-}

{-# LANGUAGE ScopedTypeVariables #-}

module Pancake.Reading ( retrieve
                       , retrieve'
                       , readDoc
                       ) where

import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.ByteString.Char8 as BS
import Network.URI
import qualified Text.Pandoc as P
import System.Process
import Control.Exception (handle, SomeException)
import Control.Applicative ((<|>))
import Data.Text.Encoding (decodeUtf8', decodeLatin1)
import Data.Default
import System.Console.Terminfo (setupTermFromEnv, getCapability, termColumns)
import System.FilePath
import Data.Char
import System.Exit
import System.Environment
import GHC.IO.Handle
import Text.Parsec hiding ((<|>))
import Text.Parsec.ByteString
import Data.Maybe
import Data.Version
import qualified Data.Map as M
import Control.Monad.IO.Class
import System.Directory
import Control.Concurrent.STM.TVar
import System.Timeout

import Text.Pandoc.Readers.Plain
import Text.Pandoc.Readers.Gopher
import Text.Pandoc.Readers.RDF
import Pancake.Configuration
import Pancake.Common
import Paths_pancake


-- | Metadata (header, URI, document type) parser.
pMeta :: Parser (Maybe URI, Maybe String)
pMeta = do
  _ <- newline
  _ <- string "-pancake-"
  _ <- newline
  u <- optionMaybe $ do
    _ <- string "uri: "
    u <- manyTill anyToken newline
    maybe (fail "Failed to parse URI") pure $ parseURI u
  t <- option Nothing $ do
    _ <- string "type: "
    optional $ try $ manyTill alphaNum (char '/')
    t <- optionMaybe $ many1 $ choice [alphaNum, char '-']
    _ <- manyTill anyToken newline
    pure t
  eof
  pure (u, t)

-- | Document body + metadata parser.
pWithMeta :: Parser (BS.ByteString, (Maybe URI, Maybe String))
pWithMeta = (,) . BS.pack <$> manyTill anyToken (try $ lookAhead pMeta)
            <*> pMeta

-- | A wrapper around 'retrieve' that adjusts the URI.
retrieve' :: MonadIO m
          => Config
          -- ^ Configuration.
          -> Bool
          -- ^ Cache.
          -> Maybe URI
          -- ^ Current URI.
          -> URI
          -- ^ Target URI.
          -> m (URI, Maybe (BS.ByteString, Maybe URI, Maybe String))
retrieve' c cache cu tu' = do
  let tu = tu' { uriFragment = "" }
      adjustedURI = case (cu, uriIsAbsolute tu) of
        (Just cu', False) -> relativeTo tu cu'
        _ -> tu
      uScheme = case uriScheme adjustedURI of
        [] -> "unknown"
        s -> init s
      cmd = fromMaybe (defaultCommand c) $ M.lookup uScheme (commands c)
  doc <- liftIO $ if cache
    then do
    cacheDir <- getXdgDirectory XdgCache "pancake"
    createDirectoryIfMissing True cacheDir
    let fp = cacheDir </> escapeURI tu
    exists <- doesFileExist fp
    if exists
      then do
      fc <- BS.readFile fp
      pure $ Just (fc, Nothing, Nothing)
      else do
      d <- retrieve cmd adjustedURI
      case d of
        (Just (fc, _, _)) -> BS.writeFile fp fc
        _ -> pure ()
      pure d
    else retrieve cmd adjustedURI
  pure (adjustedURI, doc)


-- | Retrieves a document. Prints an error message and returns an
-- empty string on failure.
retrieve :: String
         -- ^ Shell command to use for retrieval.
         -> URI
         -- ^ Document URI.
         -> IO (Maybe (BS.ByteString, Maybe URI, Maybe String))
         -- ^ File contents, effective URI, type.
retrieve cmd uri = do
  putErrLn $ "Retrieving " ++ show uri
  curEnv <- getEnvironment
  let envAuthority = maybe [] (\x -> [ ("URI_USERINFO", uriUserInfo x)
                                     , ("URI_REGNAME", uriRegName x)
                                     , ("URI_PORT", uriPort x) ])
        (uriAuthority uri)
      uriStr = uriToString id uri ""
      environment = ("URI", uriStr)
                    : ("URI_ESCAPED", escapeURIString isUnreserved uriStr)
                    : ("URI_SCHEME", uriScheme uri)
                    : ("URI_PATH", uriPath uri)
                    : ("URI_QUERY", uriQuery uri)
                    : ("URI_FRAGMENT", uriFragment uri)
                    : ("PANCAKE", showVersion version)
                    : curEnv
                    ++ envAuthority
  handle (\(e :: SomeException) ->
            putErrLn (concat ["Failed to run `", cmd, "`: ", show e])
            >> pure Nothing) $
    withCreateProcess ((shell cmd) { env = Just environment
                                   , std_out = CreatePipe
                                   , std_err = CreatePipe
                                   , delegate_ctlc = True }) $
    \_ stdout stderr ph -> case stdout of
      Nothing -> putErrLn "No stdout" >> pure Nothing
      Just stdout' -> do
        hSetBinaryMode stdout' True
        out <- BS.hGetContents stdout'
        ec <- waitForProcess ph
        if ec /= ExitSuccess
          then do
          putErrLn $ "An error occurred. Exit code: " ++ show ec
          case stderr of
            Nothing -> pure ()
            Just stderr' -> do
              err <- BS.hGetContents stderr'
              putErrLn $ "stderr:\n" ++ BS.unpack err
          else putErrLn $ show uri
        case parse pWithMeta (uriToString id uri "") out of
          Left _ -> pure $ Just (out, Nothing, Nothing)
          Right (bs, (u, t)) -> pure $ Just (bs, u, t)

-- | An Emacs file variable parser. Extracts a mode if it's set.
pEmacsMode :: Parser String
pEmacsMode = do
  _ <- manyTill (noneOf "\r\n") (string "-*-")
  spaces
  vs <- try fileVariable `sepEndBy` (char ';' >> spaces)
  spaces
  _ <- string "-*-"
  maybe (fail "no mode variable found") pure (lookup "mode" vs)
  where
    fileVariable :: Parser (String, String)
    fileVariable = do
      -- this is restrictive, but should suffice for idiomatic names
      name <- many1 (choice [alphaNum, char '-'])
      char ':' >> spaces
      val <- reverse . dropWhile isSpace . reverse <$> manyTill anyChar
        (choice $ map (try . lookAhead . string) [";", "\n", "-*-"])
      pure (name, val)

-- | Parses a document into a Pandoc structure. The parser is chosen
-- depending on the document type (if one is provided) or its URI.
readDoc :: MonadIO m
        => Config
        -- ^ Configuration.
        -> TVar (M.Map String String)
        -- ^ RDF cache
        -> BS.ByteString
        -- ^ Raw document data.
        -> Maybe String
        -- ^ Document type.
        -> URI
        -- ^ Document URI.
        -> m (Either P.PandocError P.Pandoc)
        -- ^ A parsed document.
readDoc c rdfc out dt uri = do
  term <- liftIO setupTermFromEnv
  let (reader, exts) = either (const plain) id $
        maybe (Left "no type suggestions") byExtension dt
        <|> case (uriScheme uri, map toLower $ takeExtension $ uriPath uri) of
              -- some exceptions and special cases (might be better to make
              -- this configurable)
              ("http:", ext) -> http ext
              ("https:", ext) -> http ext
              ("gopher:", ext) -> case uriPath uri of
                ('/':'1':_) -> gopher
                ('/':'h':_) -> html
                -- "0" should indicate plain text, but it's also the most
                -- suitable option for non-html markup. Not sure about this
                -- approach, but it's similar to ignoring HTTP content-type,
                -- and will do for now: better to render documents nicely
                -- when possible.
                ('/':'0':_) -> byExtension' ext
                -- unknown or unrecognized item type
                _ -> byExtension' ext <|> gopher
              (_, ext) -> byExtension' ext
        <|> either (Left . show) byExtension
        (parse pEmacsMode (uriToString id uri "") out)
      cols = fromMaybe 80 $ getCapability term termColumns
      opts = def { P.readerColumns = cols, P.readerExtensions = exts }
  r <- liftIO $ timeout (pandocTimeout c * 1000000) $ case reader of
    P.TextReader f -> case decodeUtf8' out of
      Left err -> do
        putErrLn $ show err
        P.runIO $ f opts $ decodeLatin1 out
      Right r -> P.runIO $ f opts r
    P.ByteStringReader f -> P.runIO $ f opts $ BL.fromStrict out
  pure $ fromMaybe (Left (P.PandocSomeError "Timed out.")) r
  where
    http ext = byExtension' ext <|> html
    html = P.getReader "html"
    plain = (P.TextReader . const $ readPlain, P.emptyExtensions)
    gopher = pure (P.TextReader . const $ readGopher, P.emptyExtensions)
    rdf = ( P.TextReader . const $ readRDF rdfc uri retrieveRelative
          , P.emptyExtensions)
    byExtension' ext = byExtension $ dropWhile (== '.') ext
    byExtension "md" = P.getReader "markdown"
    byExtension "htm" = html
    byExtension "ltx" = P.getReader "latex"
    byExtension "tex" = P.getReader "latex"
    byExtension "txt" = pure plain
    byExtension "plain" = pure plain
    byExtension "rdf" = pure rdf
    byExtension "turtle" = pure rdf
    byExtension ext = P.getReader ext
    retrieveRelative u = do
      x <- retrieve' c True (Just uri) u
      case x of
        (_, Just (bs, _, _)) -> case decodeUtf8' bs of
          Right t -> pure t
          _ -> fail "Failed to decode as UTF-8"
        _ -> fail "Failed to retrieve a document"