From f735c69d6d0d13858588c3c35317ef294f2382d6 Mon Sep 17 00:00:00 2001 From: defanor Date: Wed, 13 Dec 2017 07:46:46 +0300 Subject: Add uncluttering/XSLT support --- Pancake.hs | 13 ++++--- Pancake/Configuration.hs | 3 ++ Pancake/Unclutter.hs | 92 ++++++++++++++++++++++++++++++++++++++++++++++++ README | 36 +++++++++++++++++++ pancake.cabal | 5 +++ 5 files changed, 145 insertions(+), 4 deletions(-) create mode 100644 Pancake/Unclutter.hs diff --git a/Pancake.hs b/Pancake.hs index 55bdf8d..129585d 100644 --- a/Pancake.hs +++ b/Pancake.hs @@ -52,6 +52,7 @@ import Data.Version import System.Console.GetOpt import System.Posix.Signals import Control.Concurrent +import Text.Regex.TDFA import Pancake.Common import Pancake.Configuration @@ -59,6 +60,7 @@ import Pancake.Command import Pancake.Reading import Pancake.Rendering import Pancake.Printing +import Pancake.Unclutter import Paths_pancake -- | A zipper kind of thing, for scrolling and history traversal. @@ -71,7 +73,8 @@ data LoopState = LS { history :: Sliding (URI, P.Pandoc) , conf :: Config , embedded :: Bool , interrupted :: Bool - } deriving (Show) + , unclutterRegexps :: [(Regex, String)] + } -- | Renders a parsed document. printDoc :: MonadIO m => URI -> P.Pandoc -> StateT LoopState m () @@ -95,7 +98,8 @@ printDoc uri doc = do updateConfig :: MonadIO m => StateT LoopState m () updateConfig = do c <- loadConfig - modify $ \s -> s { conf = c } + u <- prepareUnclutter c + modify $ \s -> s { conf = c, unclutterRegexps = u } -- | A wrapper around 'retrieve' that adjusts the URI. loadRaw :: MonadIO m => URI -> @@ -140,7 +144,8 @@ loadDocument sType rawURI = do (_, other) -> other case M.lookup ext (externalViewers $ conf st) of Nothing -> do - doc <- readDoc rawDoc fType effectiveURI + uDoc <- tryUnclutter (unclutterRegexps st) effectiveURI rawDoc + doc <- readDoc uDoc fType effectiveURI case doc of Left err -> do putErrLn $ show err @@ -348,7 +353,7 @@ main = do tid <- myThreadId _ <- installHandler sigINT (Catch (throwTo tid UserInterrupt)) Nothing let run e = runStateT (updateConfig >> eventLoop) - (LS ([],[]) 0 [] def e False) + (LS ([],[]) 0 [] def e False []) >> pure () case getOpt Permute options args of ([], [], []) -> run False diff --git a/Pancake/Configuration.hs b/Pancake/Configuration.hs index 2748907..ae42f65 100644 --- a/Pancake/Configuration.hs +++ b/Pancake/Configuration.hs @@ -67,6 +67,8 @@ data Config = Config { commands :: M.Map String String , indentDivs :: Bool -- ^ Whether to add indentation for elements -- inside divs. + , unclutter :: M.Map String String + -- ^ XSLT file and URI regex. } deriving (Generic, Show, Eq) -- | For configuration parsing. @@ -107,6 +109,7 @@ instance Default Config where , historyDepth = 100 , referenceDigits = "0123456789" , indentDivs = False + , unclutter = M.empty } where curl = "curl --compressed -4 -L " ++ diff --git a/Pancake/Unclutter.hs b/Pancake/Unclutter.hs new file mode 100644 index 0000000..9cf2331 --- /dev/null +++ b/Pancake/Unclutter.hs @@ -0,0 +1,92 @@ +{- +Copyright (C) 2017 defanor + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +-} + +{- | +Module : Pancake.Unclutter +Maintainer : defanor +Stability : unstable +Portability : non-portable (GHC extensions are used) + +An XSLT-based data extraction module. +-} + +{-# LANGUAGE TupleSections #-} + +module Pancake.Unclutter ( tryUnclutter + , prepareUnclutter + ) where + +import qualified Data.ByteString.Char8 as BS +import Control.Monad.IO.Class +import qualified Data.Map as M +import Network.URI +import Data.Either +import System.Directory +import System.FilePath +import Data.List +import Text.Regex.TDFA.String +import Text.Regex.Base.RegexLike +import Text.XML.HXT.Core (readString, withParseHTML, runX, XmlTree, (>>>), yes) +import Text.XML.HXT.DOM.ShowXml (xshow) +import Text.XML.HXT.XSLT (xsltApplyStylesheetFromURI) +import Data.Text.Encoding (decodeUtf8', decodeLatin1, encodeUtf8) +import qualified Data.Text as T + + +import Pancake.Common +import Pancake.Configuration + +-- | Tries to unclutter a document. +tryUnclutter :: MonadIO m + => [(Regex, String)] + -- ^ Obtained with 'prepareUnclutter'. + -> URI + -- ^ Document URI. + -> BS.ByteString + -- ^ Raw document. + -> m BS.ByteString +tryUnclutter rs u d = do + let matches (r, _) = case execute r uStr of + Right (Just _) -> True + _ -> False + case find matches rs of + Just (_, fn) -> liftIO $ do + dir <- getXdgDirectory XdgConfig "pancake" + let src = dir "unclutter" fn <.> "xsl" + exists <- doesFileExist src + if exists + then do + let txt = T.unpack $ either (const $ decodeLatin1 d) id $ decodeUtf8' d + doc = readString [withParseHTML yes] txt + rc <- runX (doc >>> xsltApplyStylesheetFromURI src) :: IO [XmlTree] + pure $ case rc of + [] -> d + _ -> encodeUtf8 $ T.pack $ xshow rc + else pure d + Nothing -> pure d + where uStr = uriToString id u "" + +-- | Compiles regexps for uncluttering. +prepareUnclutter :: MonadIO m => Config -> m [(Regex, String)] +prepareUnclutter c = do + let re = map + (\(f, r) -> fmap (, f) (compile defaultCompOpt defaultExecOpt r)) + $ M.toList $ unclutter c + errs = lefts re + compiled = rights re + mapM_ putErrLn errs + pure compiled diff --git a/README b/README index c5a13db..b292f73 100644 --- a/README +++ b/README @@ -92,6 +92,8 @@ simplified):: commands: gopher: torify curl "${URI}" ssh: scp "${URI_REGNAME}:${URI_PATH}" /dev/stdout + unclutter: + duckduckgo: ^https://duckduckgo\.com/lite/\?q= defaultCommand: ! 'curl --compressed -4 -L -w " -pancake- @@ -115,6 +117,40 @@ simplified):: paginate: true +Uncluttering +~~~~~~~~~~~~ + +XSLT can be used to extract useful data from HTML documents. In the +above sample configuration, ``duckduckgo`` is defined along with an +URI regex to determine when it should be applied, and the +corresponding XSLT file should be in +``~/.config/pancake/unclutter/duckduckgo.xsl``:: + + + + + + + + + +
+
+ + + +
+ +
+ +
+
+ + +
+
+ + Screenshots ----------- diff --git a/pancake.cabal b/pancake.cabal index 5f052f4..211f5ea 100644 --- a/pancake.cabal +++ b/pancake.cabal @@ -41,6 +41,7 @@ executable pancake , Pancake.Rendering , Pancake.Printing , Pancake.Command + , Pancake.Unclutter , Paths_pancake build-depends: base >= 4.9 && < 5 , bytestring >= 0.10.8.1 && < 1 @@ -48,12 +49,16 @@ executable pancake , data-default >= 0.7.1.1 && < 1 , directory >= 1.2.6.2 && < 2 , filepath >= 1.4.1.0 && < 2 + , hxt >= 9.3.1 && < 10 + , hxt-xslt >= 9.1 && < 10 , mtl >= 2.2.1 && < 3 , network-uri >= 2.6.1.0 && < 3 , pandoc >= 2 && < 3 , pandoc-types >= 1.17.0.5 && < 2 , parsec >= 3.1.11 && < 4 , process >= 1.6 && < 2 + , regex-base >= 0.93.2 && < 1 + , regex-tdfa >= 1.2.2 && < 2 , terminfo >= 0.4.0.2 && < 1 , text >= 1.2.2.2 && < 2 , unix >= 2.7.2.0 && < 3 -- cgit v1.2.3