Add Folder scrape feed
This commit is contained in:
parent
9b25c3dba1
commit
1d7a83b6d5
4 changed files with 104 additions and 1 deletions
82
packages/rssfeeds/Folders.hs
Normal file
82
packages/rssfeeds/Folders.hs
Normal file
|
@ -0,0 +1,82 @@
|
||||||
|
module Main (main) where
|
||||||
|
|
||||||
|
import Data.List.Extra qualified as List
|
||||||
|
import Data.String.Interpolate (i)
|
||||||
|
import Data.Text qualified as Text
|
||||||
|
import Data.Time (UTCTime)
|
||||||
|
import Data.Time qualified as Time
|
||||||
|
import Network.Wreq qualified as Wreq
|
||||||
|
import Relude
|
||||||
|
import Relude.Extra ((^.))
|
||||||
|
import Text.Atom.Feed qualified as Feed
|
||||||
|
import Text.Atom.Feed.Export qualified as Feed
|
||||||
|
import Text.HTML.TagSoup qualified as TagSoup
|
||||||
|
import Witch (into)
|
||||||
|
|
||||||
|
extractItem :: [TagSoup.Tag Text] -> Maybe (Text, Text, UTCTime)
|
||||||
|
extractItem x = case ( List.firstJust extractFolder . filter (TagSoup.isTagOpenName "a") $ x
|
||||||
|
, TagSoup.maybeTagText <=< (find TagSoup.isTagText . dropWhile (\tag -> not (TagSoup.isTagOpenName "td" tag && "date" == TagSoup.fromAttrib "class" tag))) $ x
|
||||||
|
) of
|
||||||
|
(Just (link, title), Just date_str) | Just date <- Time.parseTimeM True Time.defaultTimeLocale "%Y-%b-%d %H:%M" (toString date_str) -> Just (link, title, date)
|
||||||
|
_ -> Nothing
|
||||||
|
|
||||||
|
extractIndex :: Text -> IO [(Text, Text, UTCTime)]
|
||||||
|
extractIndex = \url ->
|
||||||
|
mapMaybe extractItem
|
||||||
|
. List.split (TagSoup.isTagOpenName "tr")
|
||||||
|
. dropWhile (not . TagSoup.isTagOpenName "tbody")
|
||||||
|
. TagSoup.parseTags
|
||||||
|
. decodeUtf8
|
||||||
|
. (^. Wreq.responseBody)
|
||||||
|
<$> Wreq.get [i|#{url}|]
|
||||||
|
|
||||||
|
collectEntries :: Text -> (Text, Text, UTCTime) -> IO [(Feed.Entry)]
|
||||||
|
collectEntries = \url (link, title, date) ->
|
||||||
|
if Text.isSuffixOf "/" link
|
||||||
|
then do
|
||||||
|
entries <- extractIndex [i|#{url}#{link}|]
|
||||||
|
join <$> forM entries (collectEntries [i|#{url}#{link}|])
|
||||||
|
else
|
||||||
|
pure
|
||||||
|
[ ( ( Feed.nullEntry
|
||||||
|
(Text.dropAround (== '/') link)
|
||||||
|
(Feed.TextString title)
|
||||||
|
(timestamp date)
|
||||||
|
)
|
||||||
|
{ Feed.entryLinks = [Feed.nullLink [i|#{url}#{link}|]]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
[root_dir] <- getArgs
|
||||||
|
folders <- extractIndex (toText root_dir)
|
||||||
|
feeds <- forM folders \x@(link, title, date) -> do
|
||||||
|
let path = Text.dropAround (== '/') link
|
||||||
|
entries <- collectEntries [i|#{root_dir}/|] x
|
||||||
|
let num = length entries
|
||||||
|
emptyFeed =
|
||||||
|
Feed.nullFeed
|
||||||
|
[i|#{path}-#{timestamp date}|]
|
||||||
|
(Feed.TextString title)
|
||||||
|
(timestamp date)
|
||||||
|
feed = fromMaybe (error "Could not produce feed.") $ Feed.textFeed emptyFeed{Feed.feedEntries = entries}
|
||||||
|
writeFileLText [i|#{title}.xml|] feed
|
||||||
|
pure [i|<li><a href="#{path}.xml">#{title} (#{num} entries, newest from #{timestamp date})</a></li>|]
|
||||||
|
writeFileText [i|index.html|] $
|
||||||
|
Text.unlines $
|
||||||
|
["<!DOCTYPE html>", "<html>", "<head>", "<title>Available RSS Feeds</title>", "</head>", "<body>", "<h1>Available RSS Feeds</h1>", "<ul>"] ++ feeds ++ ["</ul>", "</body>", "</html>"]
|
||||||
|
|
||||||
|
timestamp :: UTCTime -> Text
|
||||||
|
timestamp = into . Time.formatTime Time.defaultTimeLocale "%Y-%m-%d %H:%M"
|
||||||
|
|
||||||
|
extractFolder :: TagSoup.Tag Text -> Maybe (Text, Text)
|
||||||
|
extractFolder = \tag ->
|
||||||
|
let
|
||||||
|
title = TagSoup.fromAttrib "title" tag
|
||||||
|
href = TagSoup.fromAttrib "href" tag
|
||||||
|
in
|
||||||
|
if Text.null title || Text.null href
|
||||||
|
then Nothing
|
||||||
|
else Just (href, title)
|
|
@ -20,6 +20,7 @@
|
||||||
text,
|
text,
|
||||||
time,
|
time,
|
||||||
witch,
|
witch,
|
||||||
|
wreq,
|
||||||
}:
|
}:
|
||||||
mkDerivation {
|
mkDerivation {
|
||||||
pname = "rssfeeds";
|
pname = "rssfeeds";
|
||||||
|
@ -47,6 +48,7 @@ mkDerivation {
|
||||||
text
|
text
|
||||||
time
|
time
|
||||||
witch
|
witch
|
||||||
|
wreq
|
||||||
];
|
];
|
||||||
license = "unknown";
|
license = "unknown";
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
cradle:
|
cradle:
|
||||||
cabal:
|
cabal: {}
|
||||||
|
|
|
@ -74,6 +74,25 @@ executable mail2rss
|
||||||
|
|
||||||
default-language: GHC2021
|
default-language: GHC2021
|
||||||
|
|
||||||
|
executable folders2rss
|
||||||
|
import: common-options
|
||||||
|
main-is: Folders.hs
|
||||||
|
build-depends:
|
||||||
|
, base
|
||||||
|
, containers
|
||||||
|
, extra
|
||||||
|
, feed >=1.3.0.0
|
||||||
|
, filepattern
|
||||||
|
, relude
|
||||||
|
, string-interpolate
|
||||||
|
, tagsoup
|
||||||
|
, text
|
||||||
|
, time
|
||||||
|
, witch
|
||||||
|
, wreq
|
||||||
|
|
||||||
|
default-language: GHC2021
|
||||||
|
|
||||||
executable mastodon2rss
|
executable mastodon2rss
|
||||||
import: common-options
|
import: common-options
|
||||||
main-is: Mastodon.hs
|
main-is: Mastodon.hs
|
||||||
|
|
Loading…
Reference in a new issue