From 9939aa23bb11b22be9e570c972be6a898b9df8eb Mon Sep 17 00:00:00 2001 From: defanor Date: Wed, 21 Feb 2018 18:55:51 +0300 Subject: Use Redland bindings instead of rdf4h It's faster, includes a proper triple store with querying, and generally should be more useful. --- Pancake.hs | 6 +- Pancake/Reading.hs | 7 +- Text/Pandoc/Readers/RDF.hs | 211 +++++++++++++++++---------------------------- pancake.cabal | 3 +- 4 files changed, 82 insertions(+), 145 deletions(-) diff --git a/Pancake.hs b/Pancake.hs index 38b6a7d..5420530 100644 --- a/Pancake.hs +++ b/Pancake.hs @@ -52,7 +52,6 @@ import Data.Version import System.Console.GetOpt import Text.Regex.TDFA import qualified System.Console.Haskeline as HL -import Control.Concurrent.STM.TVar import Pancake.Common import Pancake.Configuration @@ -82,7 +81,6 @@ data LoopState = LS { history :: Sliding HistoryEntry , interrupted :: Bool , unclutterRegexps :: [(Regex, String)] , columns :: Maybe Int - , rdfCache :: TVar (M.Map String String) } -- | Main event loop's type. @@ -150,7 +148,7 @@ loadDocument sType rawURI = do case M.lookup ext (externalViewers $ conf st) of Nothing -> do uDoc <- tryUnclutter (unclutterRegexps st) effectiveURI rawDoc - doc <- readDoc (conf st) (rdfCache st) uDoc fType effectiveURI + doc <- readDoc (conf st) uDoc fType effectiveURI case doc of Left err -> do putErrLn $ show err @@ -431,7 +429,6 @@ main = do >>= \st -> command (parseCommand (conf st) (unwords cmd)) dir <- getXdgDirectory XdgConfig "pancake" let hfn = dir "command_history" - rdfc <- newTVarIO M.empty _ <- runStateT (HL.runInputT (HL.setComplete complete HL.defaultSettings) @@ -448,7 +445,6 @@ main = do , interrupted = False , unclutterRegexps = [] , columns = Nothing - , rdfCache = rdfc } pure () run diff --git a/Pancake/Reading.hs b/Pancake/Reading.hs index 3a83e46..0124ec9 100644 --- a/Pancake/Reading.hs +++ b/Pancake/Reading.hs @@ -53,7 +53,6 @@ import Data.Version import qualified Data.Map as M import Control.Monad.IO.Class import System.Directory -import Control.Concurrent.STM.TVar import System.Timeout import Text.Pandoc.Readers.Plain @@ -203,8 +202,6 @@ pEmacsMode = do readDoc :: MonadIO m => Config -- ^ Configuration. - -> TVar (M.Map String String) - -- ^ RDF cache -> BS.ByteString -- ^ Raw document data. -> Maybe String @@ -213,7 +210,7 @@ readDoc :: MonadIO m -- ^ Document URI. -> m (Either P.PandocError P.Pandoc) -- ^ A parsed document. -readDoc c rdfc out dt uri = do +readDoc c out dt uri = do term <- liftIO setupTermFromEnv let (reader, exts) = either (const plain) id $ maybe (Left "no type suggestions") byExtension dt @@ -251,7 +248,7 @@ readDoc c rdfc out dt uri = do html = P.getReader "html" plain = (P.TextReader . const $ readPlain, P.emptyExtensions) gopher = pure (P.TextReader . const $ readGopher, P.emptyExtensions) - rdf = ( P.TextReader . const $ readRDF rdfc uri retrieveRelative + rdf = ( P.TextReader . const $ readRDF uri retrieveRelative , P.emptyExtensions) byExtension' ext = byExtension $ dropWhile (== '.') ext byExtension "md" = P.getReader "markdown" diff --git a/Text/Pandoc/Readers/RDF.hs b/Text/Pandoc/Readers/RDF.hs index 9a7b6de..081b71e 100644 --- a/Text/Pandoc/Readers/RDF.hs +++ b/Text/Pandoc/Readers/RDF.hs @@ -22,7 +22,8 @@ Stability : unstable This module is for RDF documents reading. It's not strictly a parser, since it requests additional external documents to read predicate -labels from, and generally controls how those would be rendered. +labels from, and looks up for alternate versions, and decides how to +render those. -} {-# LANGUAGE OverloadedStrings #-} @@ -31,158 +32,102 @@ module Text.Pandoc.Readers.RDF ( readRDF ) where import Text.Pandoc.Definition import Text.Pandoc.Class (PandocMonad) -import Data.RDF +-- import Data.RDF import qualified Data.Text as T -import Network.URI (URI, uriFragment, parseURIReference, relativeFrom, - uriToString) +import Network.URI (URI, parseURIReference, relativeFrom, uriToString) import Data.List (intersperse) import Control.Monad.IO.Class (liftIO, MonadIO) import Text.XML.HXT.Core ( readString, withParseHTML, withWarnings, runX, (>>>), yes, no, XNode(..)) import Data.Tree.NTree.TypeDefs (NTree(..)) import Text.XML.HXT.XPath.Arrows (getXPathTreesInDoc) -import Control.Monad.Error.Class (throwError, catchError) +import Control.Monad.Error.Class (throwError) import Text.Pandoc.Error (PandocError(..)) -import Control.Concurrent.STM.TVar (TVar, readTVarIO, modifyTVar) -import Control.Monad.STM (atomically) -import qualified Data.Map as M +import Control.Exception +import System.Directory +import Foreign +import Redland.MidLevel +import Redland.LowLevel +import Redland.Util --- | Reads a literal value. -readLiteral :: LValue -> [Inline] -readLiteral (PlainL t) = pure . Str $ T.unpack t -readLiteral (PlainLL t _) = pure . Str $ T.unpack t -readLiteral (TypedL t _) = pure . Str $ T.unpack t --- | Reads an arbitrary 'Node', used mostly for objects and subjects. -readNode :: URI -> Node -> [Inline] -readNode _ (LNode l) = readLiteral l -readNode bu (UNode u) = case parseURIReference (T.unpack u) of - Just u' -> let u'' = relativeFrom u' bu - us = uriToString id u'' "" - in [Link (us, [], []) [] (us, us)] - Nothing -> let s = (T.unpack u) in [Link (s, [], []) [] (s, s)] -readNode _ (BNode t) = let s = T.unpack t - in [Link (s, [], []) [] (s, s)] -readNode _ (BNodeGen i) = let s = show i - in [Link (s, [], []) [] (s, s)] - --- | Reads a predicate, looks up its label in external documents using --- the provided retrieval function. -readPredicate :: (MonadIO m, PandocMonad m) - => TVar (M.Map String String) - -- ^ RDF cache - -> URI - -- ^ Base URI. - -> (URI -> m T.Text) - -- ^ Retrieval function. - -> Node - -- ^ A node to read. - -> m [Inline] -readPredicate rdfc _ rf (UNode u) = do - rdfc' <- liftIO $ readTVarIO rdfc - l <- case M.lookup uriStr rdfc' of - Just cl -> pure cl - Nothing -> do - (u', doc) <- case parseURIReference uriStr of - Just r -> do - ret <- rf r - pure (r, ret) - Nothing -> throwError $ PandocParseError $ - "Failed to parse an URI reference" - -- use URIs when failing to read a label - catchError (labelFromRDF u' doc) $ const $ pure uriStr - pure [Link (uriStr, [], []) [] (uriStr, l)] - where - uriStr = T.unpack u - labelFromRDF u' doc = do - rdf <- parseRDF u' rf doc - let label = (UNode "http://www.w3.org/2000/01/rdf-schema#label") - -- 'query' doesn't expand triples, so filtering manually - l <- case filter - (\x -> subjectOf x == (UNode u) && predicateOf x == label) - (expandTriples rdf) of - -- todo: there could be multiple labels in different - -- languages, handle that. - [sl] -> case objectOf sl of - (LNode (PlainL sl')) -> pure $ T.unpack sl' - _ -> pure $ show sl - _ -> pure uriStr - liftIO $ atomically $ modifyTVar rdfc $ M.insert uriStr l - pure l -readPredicate _ bu _ n = pure $ readNode bu n - --- | Reads a triple. -readTriple :: (MonadIO m, PandocMonad m) - => TVar (M.Map String String) - -- ^ RDF cache - -> URI - -- ^ Base (source) URI. - -> (URI -> m T.Text) - -- ^ Retrieval function. - -> Triple - -- ^ A triple to read. - -> m [Inline] -readTriple rdfc bu rf t = do - p <- readPredicate rdfc bu rf $ predicateOf t - pure $ intersperse Space $ - concat [readNode bu $ subjectOf t, p, readNode bu $ objectOf t] - --- | Parses an RDF (XML/RDF or Turtle). The provided document may also --- be an HTML document with an alternate version that is RDF; --- retrieves it with the provided retrieval function in such a case. -parseRDF :: (MonadIO m, PandocMonad m) - => URI - -- ^ Base (source) URI. - -> (URI -> m T.Text) - -- ^ Retrieval function. - -> T.Text - -- ^ Document to parse. - -> m (RDF AdjHashMap) -parseRDF bu rf t = do - let baseURI = uriToString id bu { uriFragment = "" } "" - -- check link rel +detectAlternateVersion :: T.Text -> IO (Maybe URI) +detectAlternateVersion t = do let doc = readString [withParseHTML yes, withWarnings no] (T.unpack t) rc <- liftIO $ runX $ doc >>> getXPathTreesInDoc "//link[@rel=\"alternate\" and @type=\"application/rdf+xml\"]/@href/text()" - case rc of - [NTree (XText uri) []] -> case parseURIReference uri of - Nothing -> throwError $ PandocParseError $ - "Failed to parse an alternate URI" - Just u' -> - if u' /= bu - then do - t' <- rf u' - parseRDF u' rf t' - else throwError $ PandocSomeError $ - "A loop is detected in alternate document versions." - _ -> do - let burl = T.pack baseURI - tryParse :: (Rdf a, RdfParser p) => p -> Either ParseFailure (RDF a) - tryParse p = parseString p t - parsed :: Either ParseFailure (RDF AdjHashMap) - parsed = -- todo: alternatives should be used here - either - (const $ either (const $ tryParse NTriplesParser) pure - (tryParse $ TurtleParser (Just (BaseUrl burl)) (Just burl))) - pure - (tryParse $ XmlParser (Just (BaseUrl burl)) (Just burl)) - case parsed of - Left err -> throwError $ PandocParseError $ show err - Right x -> pure x + pure $ case rc of + [NTree (XText uri) []] -> parseURIReference uri + _ -> Nothing + readRDF :: (MonadIO m, PandocMonad m) - => TVar (M.Map String String) - -- ^ RDF cache - -> URI + => URI -- ^ Base (source) URI. -> (URI -> m T.Text) -- ^ Retrieval function. -> T.Text -- ^ Document to parse. -> m Pandoc -readRDF rdfc bu rf t = do - rdf <- parseRDF bu rf t - Pandoc mempty . pure . LineBlock - <$> mapM (readTriple rdfc bu rf) (expandTriples rdf) +readRDF bu rf t = do + alt <- liftIO $ detectAlternateVersion t + case alt of + Nothing -> do + r <- liftIO $ handle handleRE parseDoc + case r of + Left e -> throwError $ PandocParseError e + Right r' -> pure r' + Just alt' -> do + newDoc <- rf alt' + readRDF alt' rf newDoc + where + handleRE :: RedlandException -> IO (Either String Pandoc) + handleRE e = pure $ Left $ show e + parseDoc :: IO (Either String Pandoc) + parseDoc = Right . Pandoc mempty . pure . LineBlock <$> do + withWSMU "memory" [] "temporary" "" (uriToString id bu "") $ + \world _ model uri -> do + tryParseStringIntoModel world ["turtle", "rdfxml", "n-triples"] + model uri (T.unpack t) + withQuery world model "sparql" "SELECT ?s ?p ?o WHERE { ?s ?p ?o }" + (Just uri) $ \results -> do + cacheDir <- getXdgDirectory XdgCache "pancake" + createDirectoryIfMissing True cacheDir + withWSMU "hashes" [("hash-type", "bdb"), ("dir", cacheDir)] + "rdf-cache" "" (uriToString id bu "") $ \world' _ model' uri' -> + mapM (printTriple (world', model', uri')) results + -- todo: should probably use statements instead, need to complete + -- that part of the bindings. + printTriple :: ( ForeignPtr RedlandWorld + , ForeignPtr RedlandModel + , ForeignPtr RedlandURI) + -> [(String, Node)] + -> IO [Inline] + printTriple wmu l = + concat . intersperse [Space] <$> mapM (printNode wmu . snd) l + printNode :: ( ForeignPtr RedlandWorld + , ForeignPtr RedlandModel + , ForeignPtr RedlandURI) + -> Node + -> IO [Inline] + printNode _ (BlankNode s) = pure [Str s] + printNode _ (LiteralNode s) = pure [Str s] + printNode (w, m, u) (ResourceNode s) = + let su = showURI s + q = "SELECT ?label WHERE { <" ++ s ++ + "> ?label }" + in do + l <- withQuery w m "sparql" q (Just u) $ \r -> + case r of + ([("label", LiteralNode label)]:_) -> pure label + _ -> pure su + pure [Link (su, [], []) [] (su, l)] + showURI :: String -> String + showURI u = case parseURIReference u of + Just u' -> uriToString id (relativeFrom u' bu) "" + Nothing -> u + +-- rdfproc rdf-cache parse http://xmlns.com/foaf/0.1/ +-- rdfproc rdf-cache parse http://www.w3.org/1999/02/22-rdf-syntax-ns diff --git a/pancake.cabal b/pancake.cabal index 0cc2f41..a1455c0 100644 --- a/pancake.cabal +++ b/pancake.cabal @@ -59,14 +59,13 @@ executable pancake , pandoc-types >= 1.17.0.5 && < 2 , parsec >= 3.1.11 && < 4 , process >= 1.6 && < 2 - , rdf4h >= 3 && < 4 , regex-base >= 0.93.2 && < 1 , regex-tdfa >= 1.2.2 && < 2 - , stm >= 2.4 && < 3 , terminfo >= 0.4.0.2 && < 1 , text >= 1.2.2.2 && < 2 , unix >= 2.7.2.0 && < 3 , yaml >= 0.8.23.3 && < 1 + , redland == 0.1.0.0 -- hs-source-dirs: default-language: Haskell2010 ghc-options: -Wall -- cgit v1.2.3