
Add Folder scrape feed

This commit is contained in:
Malte 2023-03-22 04:12:07 +01:00
parent 9b25c3dba1
commit 1d7a83b6d5
4 changed files with 104 additions and 1 deletion

View file

@@ -0,0 +1,82 @@
module Main (main) where
import Data.List.Extra qualified as List
import Data.String.Interpolate (i)
import Data.Text qualified as Text
import Data.Time (UTCTime)
import Data.Time qualified as Time
import Network.Wreq qualified as Wreq
import Relude
import Relude.Extra ((^.))
import Text.Atom.Feed qualified as Feed
import Text.Atom.Feed.Export qualified as Feed
import Text.HTML.TagSoup qualified as TagSoup
import Witch (into)
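
-- Pull one (href, title, date) triple out of the tags of a single table row:
-- the link and title come from the first <a> tag, the date from the text of
-- the <td class="date"> cell.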
extractItem :: [TagSoup.Tag Text] -> Maybe (Text, Text, UTCTime)
extractItem x =
  case ( List.firstJust extractFolder . filter (TagSoup.isTagOpenName "a") $ x
       , TagSoup.maybeTagText <=< (find TagSoup.isTagText . dropWhile (\tag -> not (TagSoup.isTagOpenName "td" tag && "date" == TagSoup.fromAttrib "class" tag))) $ x
       ) of
    (Just (link, title), Just date_str) | Just date <- Time.parseTimeM True Time.defaultTimeLocale "%Y-%b-%d %H:%M" (toString date_str) -> Just (link, title, date)
    _ -> Nothing
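
-- Fetch a directory listing page and turn every <tr> inside its <tbody>
-- into a candidate (href, title, date) entry.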
extractIndex :: Text -> IO [(Text, Text, UTCTime)]
extractIndex = \url ->
  mapMaybe extractItem
    . List.split (TagSoup.isTagOpenName "tr")
    . dropWhile (not . TagSoup.isTagOpenName "tbody")
    . TagSoup.parseTags
    . decodeUtf8
    . (^. Wreq.responseBody)
    <$> Wreq.get [i|#{url}|]
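
-- Walk the listing recursively: links ending in "/" are sub-folders and are
-- descended into, every other link becomes a single Atom entry.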
collectEntries :: Text -> (Text, Text, UTCTime) -> IO [Feed.Entry]
collectEntries = \url (link, title, date) ->
  if Text.isSuffixOf "/" link
    then do
      entries <- extractIndex [i|#{url}#{link}|]
      join <$> forM entries (collectEntries [i|#{url}#{link}|])
    else
      pure
        [ ( Feed.nullEntry
              (Text.dropAround (== '/') link)
              (Feed.TextString title)
              (timestamp date)
          )
            { Feed.entryLinks = [Feed.nullLink [i|#{url}#{link}|]]
            }
        ]
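
-- For each top-level folder, write one Atom feed and collect a link to it;
-- finally emit an index.html listing all generated feeds.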
main :: IO ()
main = do
  [root_dir] <- getArgs
  folders <- extractIndex (toText root_dir)
  feeds <- forM folders \x@(link, title, date) -> do
    let path = Text.dropAround (== '/') link
    entries <- collectEntries [i|#{root_dir}/|] x
    let num = length entries
        emptyFeed =
          Feed.nullFeed
            [i|#{path}-#{timestamp date}|]
            (Feed.TextString title)
            (timestamp date)
        feed = fromMaybe (error "Could not produce feed.") $ Feed.textFeed emptyFeed{Feed.feedEntries = entries}
    writeFileLText [i|#{path}.xml|] feed
    pure [i|<li><a href="#{path}.xml">#{title} (#{num} entries, newest from #{timestamp date})</a></li>|]
  writeFileText [i|index.html|] $
    Text.unlines $
      ["<!DOCTYPE html>", "<html>", "<head>", "<title>Available RSS Feeds</title>", "</head>", "<body>", "<h1>Available RSS Feeds</h1>", "<ul>"] ++ feeds ++ ["</ul>", "</body>", "</html>"]
timestamp :: UTCTime -> Text
timestamp = into . Time.formatTime Time.defaultTimeLocale "%Y-%m-%d %H:%M"
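
-- Read the href and title attributes of an <a> tag; both must be non-empty.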
extractFolder :: TagSoup.Tag Text -> Maybe (Text, Text)
extractFolder = \tag ->
  let
    title = TagSoup.fromAttrib "title" tag
    href = TagSoup.fromAttrib "href" tag
   in
    if Text.null title || Text.null href
      then Nothing
      else Just (href, title)

View file

@@ -20,6 +20,7 @@
  text,
  time,
  witch,
  wreq,
}:
mkDerivation {
  pname = "rssfeeds";
@@ -47,6 +48,7 @@ mkDerivation {
    text
    time
    witch
    wreq
  ];
  license = "unknown";
}

View file

@@ -1,2 +1,2 @@
cradle:
  cabal:
  cabal: {}

View file

@@ -74,6 +74,25 @@ executable mail2rss
  default-language: GHC2021

executable folders2rss
  import: common-options
  main-is: Folders.hs
  build-depends:
    , base
    , containers
    , extra
    , feed >=1.3.0.0
    , filepattern
    , relude
    , string-interpolate
    , tagsoup
    , text
    , time
    , witch
    , wreq
  default-language: GHC2021

executable mastodon2rss
  import: common-options
  main-is: Mastodon.hs