From 9939aa23bb11b22be9e570c972be6a898b9df8eb Mon Sep 17 00:00:00 2001 From: defanor Date: Wed, 21 Feb 2018 18:55:51 +0300 Subject: Use Redland bindings instead of rdf4h It's faster, includes a proper triple store with querying, and generally should be more useful. --- Text/Pandoc/Readers/RDF.hs | 211 +++++++++++++++++---------------------------- 1 file changed, 78 insertions(+), 133 deletions(-) (limited to 'Text/Pandoc') diff --git a/Text/Pandoc/Readers/RDF.hs b/Text/Pandoc/Readers/RDF.hs index 9a7b6de..081b71e 100644 --- a/Text/Pandoc/Readers/RDF.hs +++ b/Text/Pandoc/Readers/RDF.hs @@ -22,7 +22,8 @@ Stability : unstable This module is for RDF documents reading. It's not strictly a parser, since it requests additional external documents to read predicate -labels from, and generally controls how those would be rendered. +labels from, and looks up for alternate versions, and decides how to +render those. -} {-# LANGUAGE OverloadedStrings #-} @@ -31,158 +32,102 @@ module Text.Pandoc.Readers.RDF ( readRDF ) where import Text.Pandoc.Definition import Text.Pandoc.Class (PandocMonad) -import Data.RDF +-- import Data.RDF import qualified Data.Text as T -import Network.URI (URI, uriFragment, parseURIReference, relativeFrom, - uriToString) +import Network.URI (URI, parseURIReference, relativeFrom, uriToString) import Data.List (intersperse) import Control.Monad.IO.Class (liftIO, MonadIO) import Text.XML.HXT.Core ( readString, withParseHTML, withWarnings, runX, (>>>), yes, no, XNode(..)) import Data.Tree.NTree.TypeDefs (NTree(..)) import Text.XML.HXT.XPath.Arrows (getXPathTreesInDoc) -import Control.Monad.Error.Class (throwError, catchError) +import Control.Monad.Error.Class (throwError) import Text.Pandoc.Error (PandocError(..)) -import Control.Concurrent.STM.TVar (TVar, readTVarIO, modifyTVar) -import Control.Monad.STM (atomically) -import qualified Data.Map as M +import Control.Exception +import System.Directory +import Foreign +import Redland.MidLevel +import Redland.LowLevel +import Redland.Util --- | Reads a literal value. -readLiteral :: LValue -> [Inline] -readLiteral (PlainL t) = pure . Str $ T.unpack t -readLiteral (PlainLL t _) = pure . Str $ T.unpack t -readLiteral (TypedL t _) = pure . Str $ T.unpack t --- | Reads an arbitrary 'Node', used mostly for objects and subjects. -readNode :: URI -> Node -> [Inline] -readNode _ (LNode l) = readLiteral l -readNode bu (UNode u) = case parseURIReference (T.unpack u) of - Just u' -> let u'' = relativeFrom u' bu - us = uriToString id u'' "" - in [Link (us, [], []) [] (us, us)] - Nothing -> let s = (T.unpack u) in [Link (s, [], []) [] (s, s)] -readNode _ (BNode t) = let s = T.unpack t - in [Link (s, [], []) [] (s, s)] -readNode _ (BNodeGen i) = let s = show i - in [Link (s, [], []) [] (s, s)] - --- | Reads a predicate, looks up its label in external documents using --- the provided retrieval function. -readPredicate :: (MonadIO m, PandocMonad m) - => TVar (M.Map String String) - -- ^ RDF cache - -> URI - -- ^ Base URI. - -> (URI -> m T.Text) - -- ^ Retrieval function. - -> Node - -- ^ A node to read. - -> m [Inline] -readPredicate rdfc _ rf (UNode u) = do - rdfc' <- liftIO $ readTVarIO rdfc - l <- case M.lookup uriStr rdfc' of - Just cl -> pure cl - Nothing -> do - (u', doc) <- case parseURIReference uriStr of - Just r -> do - ret <- rf r - pure (r, ret) - Nothing -> throwError $ PandocParseError $ - "Failed to parse an URI reference" - -- use URIs when failing to read a label - catchError (labelFromRDF u' doc) $ const $ pure uriStr - pure [Link (uriStr, [], []) [] (uriStr, l)] - where - uriStr = T.unpack u - labelFromRDF u' doc = do - rdf <- parseRDF u' rf doc - let label = (UNode "http://www.w3.org/2000/01/rdf-schema#label") - -- 'query' doesn't expand triples, so filtering manually - l <- case filter - (\x -> subjectOf x == (UNode u) && predicateOf x == label) - (expandTriples rdf) of - -- todo: there could be multiple labels in different - -- languages, handle that. - [sl] -> case objectOf sl of - (LNode (PlainL sl')) -> pure $ T.unpack sl' - _ -> pure $ show sl - _ -> pure uriStr - liftIO $ atomically $ modifyTVar rdfc $ M.insert uriStr l - pure l -readPredicate _ bu _ n = pure $ readNode bu n - --- | Reads a triple. -readTriple :: (MonadIO m, PandocMonad m) - => TVar (M.Map String String) - -- ^ RDF cache - -> URI - -- ^ Base (source) URI. - -> (URI -> m T.Text) - -- ^ Retrieval function. - -> Triple - -- ^ A triple to read. - -> m [Inline] -readTriple rdfc bu rf t = do - p <- readPredicate rdfc bu rf $ predicateOf t - pure $ intersperse Space $ - concat [readNode bu $ subjectOf t, p, readNode bu $ objectOf t] - --- | Parses an RDF (XML/RDF or Turtle). The provided document may also --- be an HTML document with an alternate version that is RDF; --- retrieves it with the provided retrieval function in such a case. -parseRDF :: (MonadIO m, PandocMonad m) - => URI - -- ^ Base (source) URI. - -> (URI -> m T.Text) - -- ^ Retrieval function. - -> T.Text - -- ^ Document to parse. - -> m (RDF AdjHashMap) -parseRDF bu rf t = do - let baseURI = uriToString id bu { uriFragment = "" } "" - -- check link rel +detectAlternateVersion :: T.Text -> IO (Maybe URI) +detectAlternateVersion t = do let doc = readString [withParseHTML yes, withWarnings no] (T.unpack t) rc <- liftIO $ runX $ doc >>> getXPathTreesInDoc "//link[@rel=\"alternate\" and @type=\"application/rdf+xml\"]/@href/text()" - case rc of - [NTree (XText uri) []] -> case parseURIReference uri of - Nothing -> throwError $ PandocParseError $ - "Failed to parse an alternate URI" - Just u' -> - if u' /= bu - then do - t' <- rf u' - parseRDF u' rf t' - else throwError $ PandocSomeError $ - "A loop is detected in alternate document versions." - _ -> do - let burl = T.pack baseURI - tryParse :: (Rdf a, RdfParser p) => p -> Either ParseFailure (RDF a) - tryParse p = parseString p t - parsed :: Either ParseFailure (RDF AdjHashMap) - parsed = -- todo: alternatives should be used here - either - (const $ either (const $ tryParse NTriplesParser) pure - (tryParse $ TurtleParser (Just (BaseUrl burl)) (Just burl))) - pure - (tryParse $ XmlParser (Just (BaseUrl burl)) (Just burl)) - case parsed of - Left err -> throwError $ PandocParseError $ show err - Right x -> pure x + pure $ case rc of + [NTree (XText uri) []] -> parseURIReference uri + _ -> Nothing + readRDF :: (MonadIO m, PandocMonad m) - => TVar (M.Map String String) - -- ^ RDF cache - -> URI + => URI -- ^ Base (source) URI. -> (URI -> m T.Text) -- ^ Retrieval function. -> T.Text -- ^ Document to parse. -> m Pandoc -readRDF rdfc bu rf t = do - rdf <- parseRDF bu rf t - Pandoc mempty . pure . LineBlock - <$> mapM (readTriple rdfc bu rf) (expandTriples rdf) +readRDF bu rf t = do + alt <- liftIO $ detectAlternateVersion t + case alt of + Nothing -> do + r <- liftIO $ handle handleRE parseDoc + case r of + Left e -> throwError $ PandocParseError e + Right r' -> pure r' + Just alt' -> do + newDoc <- rf alt' + readRDF alt' rf newDoc + where + handleRE :: RedlandException -> IO (Either String Pandoc) + handleRE e = pure $ Left $ show e + parseDoc :: IO (Either String Pandoc) + parseDoc = Right . Pandoc mempty . pure . LineBlock <$> do + withWSMU "memory" [] "temporary" "" (uriToString id bu "") $ + \world _ model uri -> do + tryParseStringIntoModel world ["turtle", "rdfxml", "n-triples"] + model uri (T.unpack t) + withQuery world model "sparql" "SELECT ?s ?p ?o WHERE { ?s ?p ?o }" + (Just uri) $ \results -> do + cacheDir <- getXdgDirectory XdgCache "pancake" + createDirectoryIfMissing True cacheDir + withWSMU "hashes" [("hash-type", "bdb"), ("dir", cacheDir)] + "rdf-cache" "" (uriToString id bu "") $ \world' _ model' uri' -> + mapM (printTriple (world', model', uri')) results + -- todo: should probably use statements instead, need to complete + -- that part of the bindings. + printTriple :: ( ForeignPtr RedlandWorld + , ForeignPtr RedlandModel + , ForeignPtr RedlandURI) + -> [(String, Node)] + -> IO [Inline] + printTriple wmu l = + concat . intersperse [Space] <$> mapM (printNode wmu . snd) l + printNode :: ( ForeignPtr RedlandWorld + , ForeignPtr RedlandModel + , ForeignPtr RedlandURI) + -> Node + -> IO [Inline] + printNode _ (BlankNode s) = pure [Str s] + printNode _ (LiteralNode s) = pure [Str s] + printNode (w, m, u) (ResourceNode s) = + let su = showURI s + q = "SELECT ?label WHERE { <" ++ s ++ + "> ?label }" + in do + l <- withQuery w m "sparql" q (Just u) $ \r -> + case r of + ([("label", LiteralNode label)]:_) -> pure label + _ -> pure su + pure [Link (su, [], []) [] (su, l)] + showURI :: String -> String + showURI u = case parseURIReference u of + Just u' -> uriToString id (relativeFrom u' bu) "" + Nothing -> u + +-- rdfproc rdf-cache parse http://xmlns.com/foaf/0.1/ +-- rdfproc rdf-cache parse http://www.w3.org/1999/02/22-rdf-syntax-ns -- cgit v1.2.3