summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Pancake.hs13
-rw-r--r--Pancake/Configuration.hs3
-rw-r--r--Pancake/Unclutter.hs92
-rw-r--r--README36
-rw-r--r--pancake.cabal5
5 files changed, 145 insertions, 4 deletions
diff --git a/Pancake.hs b/Pancake.hs
index 55bdf8d..129585d 100644
--- a/Pancake.hs
+++ b/Pancake.hs
@@ -52,6 +52,7 @@ import Data.Version
import System.Console.GetOpt
import System.Posix.Signals
import Control.Concurrent
+import Text.Regex.TDFA
import Pancake.Common
import Pancake.Configuration
@@ -59,6 +60,7 @@ import Pancake.Command
import Pancake.Reading
import Pancake.Rendering
import Pancake.Printing
+import Pancake.Unclutter
import Paths_pancake
-- | A zipper kind of thing, for scrolling and history traversal.
@@ -71,7 +73,8 @@ data LoopState = LS { history :: Sliding (URI, P.Pandoc)
, conf :: Config
, embedded :: Bool
, interrupted :: Bool
- } deriving (Show)
+ , unclutterRegexps :: [(Regex, String)]
+ }
-- | Renders a parsed document.
printDoc :: MonadIO m => URI -> P.Pandoc -> StateT LoopState m ()
@@ -95,7 +98,8 @@ printDoc uri doc = do
updateConfig :: MonadIO m => StateT LoopState m ()
updateConfig = do
c <- loadConfig
- modify $ \s -> s { conf = c }
+ u <- prepareUnclutter c
+ modify $ \s -> s { conf = c, unclutterRegexps = u }
-- | A wrapper around 'retrieve' that adjusts the URI.
loadRaw :: MonadIO m => URI ->
@@ -140,7 +144,8 @@ loadDocument sType rawURI = do
(_, other) -> other
case M.lookup ext (externalViewers $ conf st) of
Nothing -> do
- doc <- readDoc rawDoc fType effectiveURI
+ uDoc <- tryUnclutter (unclutterRegexps st) effectiveURI rawDoc
+ doc <- readDoc uDoc fType effectiveURI
case doc of
Left err -> do
putErrLn $ show err
@@ -348,7 +353,7 @@ main = do
tid <- myThreadId
_ <- installHandler sigINT (Catch (throwTo tid UserInterrupt)) Nothing
let run e = runStateT (updateConfig >> eventLoop)
- (LS ([],[]) 0 [] def e False)
+ (LS ([],[]) 0 [] def e False [])
>> pure ()
case getOpt Permute options args of
([], [], []) -> run False
diff --git a/Pancake/Configuration.hs b/Pancake/Configuration.hs
index 2748907..ae42f65 100644
--- a/Pancake/Configuration.hs
+++ b/Pancake/Configuration.hs
@@ -67,6 +67,8 @@ data Config = Config { commands :: M.Map String String
, indentDivs :: Bool
-- ^ Whether to add indentation for elements
-- inside divs.
+ , unclutter :: M.Map String String
+ -- ^ XSLT file and URI regex.
} deriving (Generic, Show, Eq)
-- | For configuration parsing.
@@ -107,6 +109,7 @@ instance Default Config where
, historyDepth = 100
, referenceDigits = "0123456789"
, indentDivs = False
+ , unclutter = M.empty
}
where
curl = "curl --compressed -4 -L " ++
diff --git a/Pancake/Unclutter.hs b/Pancake/Unclutter.hs
new file mode 100644
index 0000000..9cf2331
--- /dev/null
+++ b/Pancake/Unclutter.hs
@@ -0,0 +1,92 @@
+{-
+Copyright (C) 2017 defanor <defanor@uberspace.net>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+-}
+
+{- |
+Module : Pancake.Unclutter
+Maintainer : defanor <defanor@uberspace.net>
+Stability : unstable
+Portability : non-portable (GHC extensions are used)
+
+An XSLT-based data extraction module.
+-}
+
+{-# LANGUAGE TupleSections #-}
+
+module Pancake.Unclutter ( tryUnclutter
+ , prepareUnclutter
+ ) where
+
+import qualified Data.ByteString.Char8 as BS
+import Control.Monad.IO.Class
+import qualified Data.Map as M
+import Network.URI
+import Data.Either
+import System.Directory
+import System.FilePath
+import Data.List
+import Text.Regex.TDFA.String
+import Text.Regex.Base.RegexLike
+import Text.XML.HXT.Core (readString, withParseHTML, runX, XmlTree, (>>>), yes)
+import Text.XML.HXT.DOM.ShowXml (xshow)
+import Text.XML.HXT.XSLT (xsltApplyStylesheetFromURI)
+import Data.Text.Encoding (decodeUtf8', decodeLatin1, encodeUtf8)
+import qualified Data.Text as T
+
+
+import Pancake.Common
+import Pancake.Configuration
+
+-- | Tries to unclutter a document.
+tryUnclutter :: MonadIO m
+ => [(Regex, String)]
+ -- ^ Obtained with 'prepareUnclutter'.
+ -> URI
+ -- ^ Document URI.
+ -> BS.ByteString
+ -- ^ Raw document.
+ -> m BS.ByteString
+tryUnclutter rs u d = do
+ let matches (r, _) = case execute r uStr of
+ Right (Just _) -> True
+ _ -> False
+ case find matches rs of
+ Just (_, fn) -> liftIO $ do
+ dir <- getXdgDirectory XdgConfig "pancake"
+ let src = dir </> "unclutter" </> fn <.> "xsl"
+ exists <- doesFileExist src
+ if exists
+ then do
+ let txt = T.unpack $ either (const $ decodeLatin1 d) id $ decodeUtf8' d
+ doc = readString [withParseHTML yes] txt
+ rc <- runX (doc >>> xsltApplyStylesheetFromURI src) :: IO [XmlTree]
+ pure $ case rc of
+ [] -> d
+ _ -> encodeUtf8 $ T.pack $ xshow rc
+ else pure d
+ Nothing -> pure d
+ where uStr = uriToString id u ""
+
+-- | Compiles regexps for uncluttering.
+prepareUnclutter :: MonadIO m => Config -> m [(Regex, String)]
+prepareUnclutter c = do
+ let re = map
+ (\(f, r) -> fmap (, f) (compile defaultCompOpt defaultExecOpt r))
+ $ M.toList $ unclutter c
+ errs = lefts re
+ compiled = rights re
+ mapM_ putErrLn errs
+ pure compiled
diff --git a/README b/README
index c5a13db..b292f73 100644
--- a/README
+++ b/README
@@ -92,6 +92,8 @@ simplified)::
commands:
gopher: torify curl "${URI}"
ssh: scp "${URI_REGNAME}:${URI_PATH}" /dev/stdout
+ unclutter:
+ duckduckgo: ^https://duckduckgo\.com/lite/\?q=
defaultCommand: ! 'curl --compressed -4 -L -w "
-pancake-
@@ -115,6 +117,40 @@ simplified)::
paginate: true
+Uncluttering
+~~~~~~~~~~~~
+
+XSLT can be used to extract useful data from HTML documents. In the
+above sample configuration, ``duckduckgo`` is defined along with an
+URI regex to determine when it should be applied, and the
+corresponding XSLT file should be in
+``~/.config/pancake/unclutter/duckduckgo.xsl``::
+
+ <?xml version="1.0" encoding="UTF-8"?>
+ <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+ <xsl:output method="xml" indent="yes"/>
+ <xsl:template match="/">
+ <html>
+ <body>
+ <xsl:for-each select="//tr[not(@class) or @class!='result-sponsored']">
+ <xsl:for-each select="td/a[@class='result-link']">
+ <br />
+ <br />
+ <a href="{@href}">
+ <xsl:value-of select="." />
+ </a>
+ </xsl:for-each>
+ <xsl:for-each select="td[@class='result-snippet']">
+ <br />
+ <xsl:value-of select="." />
+ </xsl:for-each>
+ </xsl:for-each>
+ </body>
+ </html>
+ </xsl:template>
+ </xsl:stylesheet>
+
+
Screenshots
-----------
diff --git a/pancake.cabal b/pancake.cabal
index 5f052f4..211f5ea 100644
--- a/pancake.cabal
+++ b/pancake.cabal
@@ -41,6 +41,7 @@ executable pancake
, Pancake.Rendering
, Pancake.Printing
, Pancake.Command
+ , Pancake.Unclutter
, Paths_pancake
build-depends: base >= 4.9 && < 5
, bytestring >= 0.10.8.1 && < 1
@@ -48,12 +49,16 @@ executable pancake
, data-default >= 0.7.1.1 && < 1
, directory >= 1.2.6.2 && < 2
, filepath >= 1.4.1.0 && < 2
+ , hxt >= 9.3.1 && < 10
+ , hxt-xslt >= 9.1 && < 10
, mtl >= 2.2.1 && < 3
, network-uri >= 2.6.1.0 && < 3
, pandoc >= 2 && < 3
, pandoc-types >= 1.17.0.5 && < 2
, parsec >= 3.1.11 && < 4
, process >= 1.6 && < 2
+ , regex-base >= 0.93.2 && < 1
+ , regex-tdfa >= 1.2.2 && < 2
, terminfo >= 0.4.0.2 && < 1
, text >= 1.2.2.2 && < 2
, unix >= 2.7.2.0 && < 3