From f735c69d6d0d13858588c3c35317ef294f2382d6 Mon Sep 17 00:00:00 2001 From: defanor Date: Wed, 13 Dec 2017 07:46:46 +0300 Subject: Add uncluttering/XSLT support --- Pancake/Configuration.hs | 3 ++ Pancake/Unclutter.hs | 92 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 Pancake/Unclutter.hs (limited to 'Pancake') diff --git a/Pancake/Configuration.hs b/Pancake/Configuration.hs index 2748907..ae42f65 100644 --- a/Pancake/Configuration.hs +++ b/Pancake/Configuration.hs @@ -67,6 +67,8 @@ data Config = Config { commands :: M.Map String String , indentDivs :: Bool -- ^ Whether to add indentation for elements -- inside divs. + , unclutter :: M.Map String String + -- ^ XSLT file and URI regex. } deriving (Generic, Show, Eq) -- | For configuration parsing. @@ -107,6 +109,7 @@ instance Default Config where , historyDepth = 100 , referenceDigits = "0123456789" , indentDivs = False + , unclutter = M.empty } where curl = "curl --compressed -4 -L " ++ diff --git a/Pancake/Unclutter.hs b/Pancake/Unclutter.hs new file mode 100644 index 0000000..9cf2331 --- /dev/null +++ b/Pancake/Unclutter.hs @@ -0,0 +1,92 @@ +{- +Copyright (C) 2017 defanor + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +-} + +{- | +Module : Pancake.Unclutter +Maintainer : defanor +Stability : unstable +Portability : non-portable (GHC extensions are used) + +An XSLT-based data extraction module. +-} + +{-# LANGUAGE TupleSections #-} + +module Pancake.Unclutter ( tryUnclutter + , prepareUnclutter + ) where + +import qualified Data.ByteString.Char8 as BS +import Control.Monad.IO.Class +import qualified Data.Map as M +import Network.URI +import Data.Either +import System.Directory +import System.FilePath +import Data.List +import Text.Regex.TDFA.String +import Text.Regex.Base.RegexLike +import Text.XML.HXT.Core (readString, withParseHTML, runX, XmlTree, (>>>), yes) +import Text.XML.HXT.DOM.ShowXml (xshow) +import Text.XML.HXT.XSLT (xsltApplyStylesheetFromURI) +import Data.Text.Encoding (decodeUtf8', decodeLatin1, encodeUtf8) +import qualified Data.Text as T + + +import Pancake.Common +import Pancake.Configuration + +-- | Tries to unclutter a document. +tryUnclutter :: MonadIO m + => [(Regex, String)] + -- ^ Obtained with 'prepareUnclutter'. + -> URI + -- ^ Document URI. + -> BS.ByteString + -- ^ Raw document. + -> m BS.ByteString +tryUnclutter rs u d = do + let matches (r, _) = case execute r uStr of + Right (Just _) -> True + _ -> False + case find matches rs of + Just (_, fn) -> liftIO $ do + dir <- getXdgDirectory XdgConfig "pancake" + let src = dir "unclutter" fn <.> "xsl" + exists <- doesFileExist src + if exists + then do + let txt = T.unpack $ either (const $ decodeLatin1 d) id $ decodeUtf8' d + doc = readString [withParseHTML yes] txt + rc <- runX (doc >>> xsltApplyStylesheetFromURI src) :: IO [XmlTree] + pure $ case rc of + [] -> d + _ -> encodeUtf8 $ T.pack $ xshow rc + else pure d + Nothing -> pure d + where uStr = uriToString id u "" + +-- | Compiles regexps for uncluttering. +prepareUnclutter :: MonadIO m => Config -> m [(Regex, String)] +prepareUnclutter c = do + let re = map + (\(f, r) -> fmap (, f) (compile defaultCompOpt defaultExecOpt r)) + $ M.toList $ unclutter c + errs = lefts re + compiled = rights re + mapM_ putErrLn errs + pure compiled -- cgit v1.2.3