From 11e83528df0f4e48d9324b2e4dd82dc58792fb16 Mon Sep 17 00:00:00 2001 From: defanor Date: Tue, 23 Jan 2018 15:28:05 +0300 Subject: Add initial RDF support With two types of caching: file-based for retrieved documents, and memory-based for predicate labels extracted from those. --- Pancake.hs | 40 ++++------ Pancake/Common.hs | 11 ++- Pancake/Reading.hs | 73 ++++++++++++++++-- README | 8 +- Text/Pandoc/Readers/RDF.hs | 188 +++++++++++++++++++++++++++++++++++++++++++++ pancake.1 | 2 +- pancake.cabal | 16 ++-- 7 files changed, 296 insertions(+), 42 deletions(-) create mode 100644 Text/Pandoc/Readers/RDF.hs diff --git a/Pancake.hs b/Pancake.hs index c59a6e1..38b6a7d 100644 --- a/Pancake.hs +++ b/Pancake.hs @@ -52,6 +52,7 @@ import Data.Version import System.Console.GetOpt import Text.Regex.TDFA import qualified System.Console.Haskeline as HL +import Control.Concurrent.STM.TVar import Pancake.Common import Pancake.Configuration @@ -81,6 +82,7 @@ data LoopState = LS { history :: Sliding HistoryEntry , interrupted :: Bool , unclutterRegexps :: [(Regex, String)] , columns :: Maybe Int + , rdfCache :: TVar (M.Map String String) } -- | Main event loop's type. @@ -111,25 +113,20 @@ updateConfig mp = do u <- prepareUnclutter c modify $ \s -> s { conf = c, unclutterRegexps = u } --- | A wrapper around 'retrieve' that adjusts the URI. +-- | A wrapper around 'retrieve''. loadRaw :: URI -> Pancake (URI, Maybe (BS.ByteString, Maybe URI, Maybe String)) loadRaw rawURI = do st <- get - let ddg = isInfixOf "/l/?kh=-1&uddg=" $ uriToString id rawURI "" - adjustedURI = case (ddg, uriIsAbsolute rawURI, history st) of + let adjustedURI -- fix DDG links (that's rather hacky, todo: improve) - (True, _, _) -> fromMaybe rawURI $ + | isInfixOf "/l/?kh=-1&uddg=" $ uriToString id rawURI "" = + fromMaybe rawURI $ parseAbsoluteURI (unEscapeString $ drop 12 (uriQuery rawURI)) - -- handle relative URIs - (_, False, (h:_, _)) -> relativeTo rawURI (hURI h) - _ -> rawURI - uScheme = case uriScheme adjustedURI of - [] -> "unknown" - s -> init s - cmd = fromMaybe (defaultCommand $ conf st) $ - M.lookup uScheme (commands $ conf st) - doc <- liftIO $ retrieve cmd adjustedURI - pure (adjustedURI, doc) + | otherwise = rawURI + currentURI = case history st of + (h:_, _) -> pure (hURI h) + _ -> mzero + retrieve' (conf st) False currentURI adjustedURI -- | Decides what to do with a given URI; either returns a document or -- runs an external viewer. Used by both 'GoTo' and 'Reload'. @@ -153,7 +150,7 @@ loadDocument sType rawURI = do case M.lookup ext (externalViewers $ conf st) of Nothing -> do uDoc <- tryUnclutter (unclutterRegexps st) effectiveURI rawDoc - doc <- readDoc uDoc fType effectiveURI + doc <- readDoc (conf st) (rdfCache st) uDoc fType effectiveURI case doc of Left err -> do putErrLn $ show err @@ -246,7 +243,7 @@ command (Save (RURI uri') p) = do remoteURIStr = uriToString id remoteURI "" remoteFileName' = takeFileName $ uriPath remoteURI remoteFileName = if remoteFileName' `elem` [".", "..", ""] - then map escapeURI remoteURIStr + then escapeURI remoteURI else remoteFileName' targetFileName = fromMaybe remoteFileName mTargetName targetPath = targetDir targetFileName @@ -260,10 +257,6 @@ command (Save (RURI uri') p) = do , encodeSexpStr $ uriToString id uri' "" , encodeSexpStr targetPath] putErrLn $ unwords ["Saved", remoteURIStr, "as", targetPath] - where - escapeURI c - | isPathSeparator c = '-' - | otherwise = c command (Save (RNumber i) p) = do st <- get if length (rLinks $ rendered st) > i @@ -347,10 +340,7 @@ command ShowCurrent = do command (Shortcut u q) = command . GoTo Nothing . RURI . fromJust . parseURI $ u ++ escapeURIString isUnreserved q command (LoadConfig p) = updateConfig p -command Quit = liftIO $ do - dir <- getXdgDirectory XdgCache "pancake" - exists <- doesDirectoryExist dir - when exists $ removeDirectoryRecursive dir +command Quit = pure () command Interrupt = putErrLn "Received SIGINT. Interrupt twice in a row to quit." command (SetWidth w) = modify $ \s -> s { columns = w } @@ -441,6 +431,7 @@ main = do >>= \st -> command (parseCommand (conf st) (unwords cmd)) dir <- getXdgDirectory XdgConfig "pancake" let hfn = dir "command_history" + rdfc <- newTVarIO M.empty _ <- runStateT (HL.runInputT (HL.setComplete complete HL.defaultSettings) @@ -457,6 +448,7 @@ main = do , interrupted = False , unclutterRegexps = [] , columns = Nothing + , rdfCache = rdfc } pure () run diff --git a/Pancake/Common.hs b/Pancake/Common.hs index ee21ab3..d107a02 100644 --- a/Pancake/Common.hs +++ b/Pancake/Common.hs @@ -24,9 +24,11 @@ Portability : portable Utility functions. -} -module Pancake.Common ( putErrLn ) where +module Pancake.Common ( putErrLn, escapeURI ) where import System.IO import Control.Monad.IO.Class +import Network.URI +import System.FilePath -- | Prints a line into stderr. @@ -34,3 +36,10 @@ putErrLn :: MonadIO m => String -> m () putErrLn s = liftIO $ do hPutStrLn stderr s hFlush stderr + +-- | Escapes an URI for use as a file name. +escapeURI :: URI -> FilePath +escapeURI u = map escapeChar $ uriToString id u "" + where escapeChar c + | isPathSeparator c = '-' + | otherwise = c diff --git a/Pancake/Reading.hs b/Pancake/Reading.hs index 52b7c28..7d86219 100644 --- a/Pancake/Reading.hs +++ b/Pancake/Reading.hs @@ -27,6 +27,7 @@ Document retrieval and parsing. {-# LANGUAGE ScopedTypeVariables #-} module Pancake.Reading ( retrieve + , retrieve' , readDoc ) where @@ -49,9 +50,15 @@ import Text.Parsec hiding ((<|>)) import Text.Parsec.ByteString import Data.Maybe import Data.Version +import qualified Data.Map as M +import Control.Monad.IO.Class +import System.Directory +import Control.Concurrent.STM.TVar import Text.Pandoc.Readers.Plain import Text.Pandoc.Readers.Gopher +import Text.Pandoc.Readers.RDF +import Pancake.Configuration import Pancake.Common import Paths_pancake @@ -80,6 +87,46 @@ pWithMeta :: Parser (BS.ByteString, (Maybe URI, Maybe String)) pWithMeta = (,) . BS.pack <$> manyTill anyToken (try $ lookAhead pMeta) <*> pMeta +-- | A wrapper around 'retrieve' that adjusts the URI. +retrieve' :: MonadIO m + => Config + -- ^ Configuration. + -> Bool + -- ^ Cache. + -> Maybe URI + -- ^ Current URI. + -> URI + -- ^ Target URI. + -> m (URI, Maybe (BS.ByteString, Maybe URI, Maybe String)) +retrieve' c cache cu tu' = do + let tu = tu' { uriFragment = "" } + adjustedURI = case (cu, uriIsAbsolute tu) of + (Just cu', False) -> relativeTo tu cu' + _ -> tu + uScheme = case uriScheme adjustedURI of + [] -> "unknown" + s -> init s + cmd = fromMaybe (defaultCommand c) $ M.lookup uScheme (commands c) + doc <- liftIO $ if cache + then do + cacheDir <- getXdgDirectory XdgCache "pancake" + createDirectoryIfMissing True cacheDir + let fp = cacheDir escapeURI tu + exists <- doesFileExist fp + if exists + then do + fc <- BS.readFile fp + pure $ Just (fc, Nothing, Nothing) + else do + d <- retrieve cmd adjustedURI + case d of + (Just (fc, _, _)) -> BS.writeFile fp fc + _ -> pure () + pure d + else retrieve cmd adjustedURI + pure (adjustedURI, doc) + + -- | Retrieves a document. Prints an error message and returns an -- empty string on failure. retrieve :: String @@ -152,16 +199,21 @@ pEmacsMode = do -- | Parses a document into a Pandoc structure. The parser is chosen -- depending on the document type (if one is provided) or its URI. -readDoc :: BS.ByteString +readDoc :: MonadIO m + => Config + -- ^ Configuration. + -> TVar (M.Map String String) + -- ^ RDF cache + -> BS.ByteString -- ^ Raw document data. -> Maybe String -- ^ Document type. -> URI -- ^ Document URI. - -> IO (Either P.PandocError P.Pandoc) + -> m (Either P.PandocError P.Pandoc) -- ^ A parsed document. -readDoc out dt uri = do - term <- setupTermFromEnv +readDoc c rdfc out dt uri = do + term <- liftIO setupTermFromEnv let (reader, exts) = either (const plain) id $ maybe (Left "no type suggestions") byExtension dt <|> case (uriScheme uri, map toLower $ takeExtension $ uriPath uri) of @@ -185,7 +237,7 @@ readDoc out dt uri = do (parse pEmacsMode (uriToString id uri "") out) cols = fromMaybe 80 $ getCapability term termColumns opts = def { P.readerColumns = cols, P.readerExtensions = exts } - case reader of + liftIO $ case reader of P.TextReader f -> case decodeUtf8' out of Left err -> do putErrLn $ show err @@ -197,6 +249,8 @@ readDoc out dt uri = do html = P.getReader "html" plain = (P.TextReader . const $ readPlain, P.emptyExtensions) gopher = pure (P.TextReader . const $ readGopher, P.emptyExtensions) + rdf = ( P.TextReader . const $ readRDF rdfc uri retrieveRelative + , P.emptyExtensions) byExtension' ext = byExtension $ dropWhile (== '.') ext byExtension "md" = P.getReader "markdown" byExtension "htm" = html @@ -204,4 +258,13 @@ readDoc out dt uri = do byExtension "tex" = P.getReader "latex" byExtension "txt" = pure plain byExtension "plain" = pure plain + byExtension "rdf" = pure rdf + byExtension "turtle" = pure rdf byExtension ext = P.getReader ext + retrieveRelative u = do + x <- retrieve' c True (Just uri) u + case x of + (_, Just (bs, _, _)) -> case decodeUtf8' bs of + Right t -> pure t + _ -> fail "Failed to decode as UTF-8" + _ -> fail "Failed to retrieve a document" diff --git a/README b/README index 662e9e9..a7ecec9 100644 --- a/README +++ b/README @@ -5,9 +5,9 @@ Pancake This is a CLI/Emacs web/gopher/file browser. It utilizes pandoc and external downloaders such as curl, adding -support for Gopher directories and plain text files, and invoking -external applications (e.g., image and PDF viewers) depending on its -configuration. +support for Gopher directories, plain text files, and RDF, and +invoking external applications (e.g., image and PDF viewers) depending +on its configuration. User interaction capabilities are rather basic, as it is intended to be combined with software that provides better user interfaces – such @@ -56,7 +56,7 @@ specific websites, e.g. webcomics, and perhaps specific images only. Commands -------- -:quit or EOF: quit pancake, cleaning the cache +:quit or EOF: quit pancake :[: back :]: forward :load config[ ]: load configuration from a specified file or diff --git a/Text/Pandoc/Readers/RDF.hs b/Text/Pandoc/Readers/RDF.hs new file mode 100644 index 0000000..9a7b6de --- /dev/null +++ b/Text/Pandoc/Readers/RDF.hs @@ -0,0 +1,188 @@ +{- +Copyright (C) 2017-2018 defanor + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +-} + +{- | +Module : Text.Pandoc.Readers.RDF +Maintainer : defanor +Stability : unstable + +This module is for RDF documents reading. It's not strictly a parser, +since it requests additional external documents to read predicate +labels from, and generally controls how those would be rendered. +-} + +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +module Text.Pandoc.Readers.RDF ( readRDF ) where + +import Text.Pandoc.Definition +import Text.Pandoc.Class (PandocMonad) +import Data.RDF +import qualified Data.Text as T +import Network.URI (URI, uriFragment, parseURIReference, relativeFrom, + uriToString) +import Data.List (intersperse) +import Control.Monad.IO.Class (liftIO, MonadIO) +import Text.XML.HXT.Core ( readString, withParseHTML, withWarnings, + runX, (>>>), yes, no, XNode(..)) +import Data.Tree.NTree.TypeDefs (NTree(..)) +import Text.XML.HXT.XPath.Arrows (getXPathTreesInDoc) +import Control.Monad.Error.Class (throwError, catchError) +import Text.Pandoc.Error (PandocError(..)) +import Control.Concurrent.STM.TVar (TVar, readTVarIO, modifyTVar) +import Control.Monad.STM (atomically) +import qualified Data.Map as M + + +-- | Reads a literal value. +readLiteral :: LValue -> [Inline] +readLiteral (PlainL t) = pure . Str $ T.unpack t +readLiteral (PlainLL t _) = pure . Str $ T.unpack t +readLiteral (TypedL t _) = pure . Str $ T.unpack t + +-- | Reads an arbitrary 'Node', used mostly for objects and subjects. +readNode :: URI -> Node -> [Inline] +readNode _ (LNode l) = readLiteral l +readNode bu (UNode u) = case parseURIReference (T.unpack u) of + Just u' -> let u'' = relativeFrom u' bu + us = uriToString id u'' "" + in [Link (us, [], []) [] (us, us)] + Nothing -> let s = (T.unpack u) in [Link (s, [], []) [] (s, s)] +readNode _ (BNode t) = let s = T.unpack t + in [Link (s, [], []) [] (s, s)] +readNode _ (BNodeGen i) = let s = show i + in [Link (s, [], []) [] (s, s)] + +-- | Reads a predicate, looks up its label in external documents using +-- the provided retrieval function. +readPredicate :: (MonadIO m, PandocMonad m) + => TVar (M.Map String String) + -- ^ RDF cache + -> URI + -- ^ Base URI. + -> (URI -> m T.Text) + -- ^ Retrieval function. + -> Node + -- ^ A node to read. + -> m [Inline] +readPredicate rdfc _ rf (UNode u) = do + rdfc' <- liftIO $ readTVarIO rdfc + l <- case M.lookup uriStr rdfc' of + Just cl -> pure cl + Nothing -> do + (u', doc) <- case parseURIReference uriStr of + Just r -> do + ret <- rf r + pure (r, ret) + Nothing -> throwError $ PandocParseError $ + "Failed to parse an URI reference" + -- use URIs when failing to read a label + catchError (labelFromRDF u' doc) $ const $ pure uriStr + pure [Link (uriStr, [], []) [] (uriStr, l)] + where + uriStr = T.unpack u + labelFromRDF u' doc = do + rdf <- parseRDF u' rf doc + let label = (UNode "http://www.w3.org/2000/01/rdf-schema#label") + -- 'query' doesn't expand triples, so filtering manually + l <- case filter + (\x -> subjectOf x == (UNode u) && predicateOf x == label) + (expandTriples rdf) of + -- todo: there could be multiple labels in different + -- languages, handle that. + [sl] -> case objectOf sl of + (LNode (PlainL sl')) -> pure $ T.unpack sl' + _ -> pure $ show sl + _ -> pure uriStr + liftIO $ atomically $ modifyTVar rdfc $ M.insert uriStr l + pure l +readPredicate _ bu _ n = pure $ readNode bu n + +-- | Reads a triple. +readTriple :: (MonadIO m, PandocMonad m) + => TVar (M.Map String String) + -- ^ RDF cache + -> URI + -- ^ Base (source) URI. + -> (URI -> m T.Text) + -- ^ Retrieval function. + -> Triple + -- ^ A triple to read. + -> m [Inline] +readTriple rdfc bu rf t = do + p <- readPredicate rdfc bu rf $ predicateOf t + pure $ intersperse Space $ + concat [readNode bu $ subjectOf t, p, readNode bu $ objectOf t] + +-- | Parses an RDF (XML/RDF or Turtle). The provided document may also +-- be an HTML document with an alternate version that is RDF; +-- retrieves it with the provided retrieval function in such a case. +parseRDF :: (MonadIO m, PandocMonad m) + => URI + -- ^ Base (source) URI. + -> (URI -> m T.Text) + -- ^ Retrieval function. + -> T.Text + -- ^ Document to parse. + -> m (RDF AdjHashMap) +parseRDF bu rf t = do + let baseURI = uriToString id bu { uriFragment = "" } "" + -- check link rel + let doc = readString [withParseHTML yes, withWarnings no] (T.unpack t) + rc <- liftIO $ runX $ doc + >>> getXPathTreesInDoc + "//link[@rel=\"alternate\" and @type=\"application/rdf+xml\"]/@href/text()" + case rc of + [NTree (XText uri) []] -> case parseURIReference uri of + Nothing -> throwError $ PandocParseError $ + "Failed to parse an alternate URI" + Just u' -> + if u' /= bu + then do + t' <- rf u' + parseRDF u' rf t' + else throwError $ PandocSomeError $ + "A loop is detected in alternate document versions." + _ -> do + let burl = T.pack baseURI + tryParse :: (Rdf a, RdfParser p) => p -> Either ParseFailure (RDF a) + tryParse p = parseString p t + parsed :: Either ParseFailure (RDF AdjHashMap) + parsed = -- todo: alternatives should be used here + either + (const $ either (const $ tryParse NTriplesParser) pure + (tryParse $ TurtleParser (Just (BaseUrl burl)) (Just burl))) + pure + (tryParse $ XmlParser (Just (BaseUrl burl)) (Just burl)) + case parsed of + Left err -> throwError $ PandocParseError $ show err + Right x -> pure x + +readRDF :: (MonadIO m, PandocMonad m) + => TVar (M.Map String String) + -- ^ RDF cache + -> URI + -- ^ Base (source) URI. + -> (URI -> m T.Text) + -- ^ Retrieval function. + -> T.Text + -- ^ Document to parse. + -> m Pandoc +readRDF rdfc bu rf t = do + rdf <- parseRDF bu rf t + Pandoc mempty . pure . LineBlock + <$> mapM (readTriple rdfc bu rf) (expandTriples rdf) diff --git a/pancake.1 b/pancake.1 index d82bca7..09cdcc6 100644 --- a/pancake.1 +++ b/pancake.1 @@ -32,7 +32,7 @@ Load configuration from a specified file. .SH COMMANDS .IP "\fBquit\fR or EOF" -quit pancake, cleaning the cache +quit pancake .IP "\fB[\fR" back .IP "\fB]\fR" diff --git a/pancake.cabal b/pancake.cabal index dbd0f78..0cc2f41 100644 --- a/pancake.cabal +++ b/pancake.cabal @@ -1,11 +1,10 @@ name: pancake version: 0.1.9 -synopsis: A CLI web/gopher/file browser -description: Pancake is a CLI web/gopher/file browser inspired - by Line Mode Browser. It relies on pandoc for - parsing, on curl and other external applications - for data loading, on emacs and other applications - to provide user interfaces. +synopsis: A CLI/Emacs web/gopher/file browser +description: Pancake is a CLI/Emacs web browser. It relies on + pandoc for parsing, on curl and other external + applications for data loading, on emacs and other + applications to provide user interfaces. tested-with: GHC == 8.0.1, GHC == 8.0.2, GHC == 8.2.1, GHC == 8.2.2 license: GPL-3 license-file: LICENSE @@ -50,7 +49,9 @@ executable pancake , data-default >= 0.7.1.1 && < 1 , directory >= 1.2.6.2 && < 2 , filepath >= 1.4.1.0 && < 2 + , haskeline >= 0.7 && < 1 , hxt >= 9.3.1 && < 10 + , hxt-xpath >= 9.1.2.2 , hxt-xslt >= 9.1 && < 10 , mtl >= 2.2.1 && < 3 , network-uri >= 2.6.1.0 && < 3 @@ -58,13 +59,14 @@ executable pancake , pandoc-types >= 1.17.0.5 && < 2 , parsec >= 3.1.11 && < 4 , process >= 1.6 && < 2 + , rdf4h >= 3 && < 4 , regex-base >= 0.93.2 && < 1 , regex-tdfa >= 1.2.2 && < 2 + , stm >= 2.4 && < 3 , terminfo >= 0.4.0.2 && < 1 , text >= 1.2.2.2 && < 2 , unix >= 2.7.2.0 && < 3 , yaml >= 0.8.23.3 && < 1 - , haskeline >= 0.7 && < 1 -- hs-source-dirs: default-language: Haskell2010 ghc-options: -Wall -- cgit v1.2.3