summaryrefslogtreecommitdiff
path: root/Pancake/Unclutter.hs
blob: 2169e00dc538a69a8abaecb193df152d5dd388b6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
{-
Copyright (C) 2017  defanor <defanor@uberspace.net>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
-}

{- |
Module      :  Pancake.Unclutter
Maintainer  :  defanor <defanor@uberspace.net>
Stability   :  unstable
Portability :  non-portable (GHC extensions are used)

An XSLT-based data extraction module.
-}

{-# LANGUAGE TupleSections #-}

module Pancake.Unclutter ( tryUnclutter
                         , prepareUnclutter
                         ) where

import qualified Data.ByteString.Char8 as BS
import Control.Monad.IO.Class
import qualified Data.Map as M
import Network.URI
import Data.Either
import System.Directory
import System.FilePath
import Data.List
import Text.Regex.TDFA.String
import Text.Regex.Base.RegexLike
import Text.XML.HXT.Core ( writeDocumentToString, readString
                         , withParseHTML, withWarnings, runX, (>>>), yes, no)
import Text.XML.HXT.XSLT (xsltApplyStylesheetFromURI)
import Data.Text.Encoding (decodeUtf8', decodeLatin1, encodeUtf8)
import qualified Data.Text as T
import Control.Exception

import Pancake.Common
import Pancake.Configuration
import Paths_pancake

-- | Tries to unclutter a document by applying an XSLT if it's
-- available. Looks for a file in a user config directory first, and
-- in the system data directory then.
tryUnclutter :: MonadIO m
             => [(Regex, String)]
             -- ^ Obtained with 'prepareUnclutter'.
             -> URI
             -- ^ Document URI.
             -> BS.ByteString
             -- ^ Raw document.
             -> m BS.ByteString
tryUnclutter rs u d = liftIO $ handle err $ do
  let matches (r, _) = case execute r uStr of
        Right (Just _) -> True
        _ -> False
  case find matches rs of
    Just (_, fn) -> do
      src <- do
        configDir <- getXdgDirectory XdgConfig "pancake"
        let src = configDir </> "unclutter" </> fn <.> "xsl"
        exists <- doesFileExist src
        dataDir <- getDataDir
        pure $ if exists then src else dataDir </> "unclutter" </> fn <.> "xsl"
      exists <- doesFileExist src
      if exists
        then do
        let txt = T.unpack $ either (const $ decodeLatin1 d) id $ decodeUtf8' d
            doc = readString [withParseHTML yes, withWarnings no] txt
        rc <- runX $ doc
              >>> xsltApplyStylesheetFromURI src
              >>> writeDocumentToString []
        pure $ case rc of
          [str] -> encodeUtf8 $ T.pack str
          _ -> d
        else pure d
    Nothing -> pure d
  where
    uStr = uriToString id u ""
    err :: SomeException -> IO BS.ByteString
    err e = putErrLn (show e) >> pure d

-- | Compiles regexps for uncluttering.
prepareUnclutter :: MonadIO m => Config -> m [(Regex, String)]
prepareUnclutter c = do
  let re = map
           (\(f, r) -> fmap (, f) (compile defaultCompOpt defaultExecOpt r))
           $ M.toList $ unclutter c
      errs = lefts re
      compiled = rights re
  mapM_ putErrLn errs
  pure compiled