Add Folder scrape feed
This commit is contained in:
parent
9b25c3dba1
commit
1d7a83b6d5
82
packages/rssfeeds/Folders.hs
Normal file
82
packages/rssfeeds/Folders.hs
Normal file
|
@ -0,0 +1,82 @@
|
|||
module Main (main) where
|
||||
|
||||
import Data.List.Extra qualified as List
|
||||
import Data.String.Interpolate (i)
|
||||
import Data.Text qualified as Text
|
||||
import Data.Time (UTCTime)
|
||||
import Data.Time qualified as Time
|
||||
import Network.Wreq qualified as Wreq
|
||||
import Relude
|
||||
import Relude.Extra ((^.))
|
||||
import Text.Atom.Feed qualified as Feed
|
||||
import Text.Atom.Feed.Export qualified as Feed
|
||||
import Text.HTML.TagSoup qualified as TagSoup
|
||||
import Witch (into)
|
||||
|
||||
extractItem :: [TagSoup.Tag Text] -> Maybe (Text, Text, UTCTime)
|
||||
extractItem x = case ( List.firstJust extractFolder . filter (TagSoup.isTagOpenName "a") $ x
|
||||
, TagSoup.maybeTagText <=< (find TagSoup.isTagText . dropWhile (\tag -> not (TagSoup.isTagOpenName "td" tag && "date" == TagSoup.fromAttrib "class" tag))) $ x
|
||||
) of
|
||||
(Just (link, title), Just date_str) | Just date <- Time.parseTimeM True Time.defaultTimeLocale "%Y-%b-%d %H:%M" (toString date_str) -> Just (link, title, date)
|
||||
_ -> Nothing
|
||||
|
||||
extractIndex :: Text -> IO [(Text, Text, UTCTime)]
|
||||
extractIndex = \url ->
|
||||
mapMaybe extractItem
|
||||
. List.split (TagSoup.isTagOpenName "tr")
|
||||
. dropWhile (not . TagSoup.isTagOpenName "tbody")
|
||||
. TagSoup.parseTags
|
||||
. decodeUtf8
|
||||
. (^. Wreq.responseBody)
|
||||
<$> Wreq.get [i|#{url}|]
|
||||
|
||||
collectEntries :: Text -> (Text, Text, UTCTime) -> IO [(Feed.Entry)]
|
||||
collectEntries = \url (link, title, date) ->
|
||||
if Text.isSuffixOf "/" link
|
||||
then do
|
||||
entries <- extractIndex [i|#{url}#{link}|]
|
||||
join <$> forM entries (collectEntries [i|#{url}#{link}|])
|
||||
else
|
||||
pure
|
||||
[ ( ( Feed.nullEntry
|
||||
(Text.dropAround (== '/') link)
|
||||
(Feed.TextString title)
|
||||
(timestamp date)
|
||||
)
|
||||
{ Feed.entryLinks = [Feed.nullLink [i|#{url}#{link}|]]
|
||||
}
|
||||
)
|
||||
]
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
[root_dir] <- getArgs
|
||||
folders <- extractIndex (toText root_dir)
|
||||
feeds <- forM folders \x@(link, title, date) -> do
|
||||
let path = Text.dropAround (== '/') link
|
||||
entries <- collectEntries [i|#{root_dir}/|] x
|
||||
let num = length entries
|
||||
emptyFeed =
|
||||
Feed.nullFeed
|
||||
[i|#{path}-#{timestamp date}|]
|
||||
(Feed.TextString title)
|
||||
(timestamp date)
|
||||
feed = fromMaybe (error "Could not produce feed.") $ Feed.textFeed emptyFeed{Feed.feedEntries = entries}
|
||||
writeFileLText [i|#{title}.xml|] feed
|
||||
pure [i|<li><a href="#{path}.xml">#{title} (#{num} entries, newest from #{timestamp date})</a></li>|]
|
||||
writeFileText [i|index.html|] $
|
||||
Text.unlines $
|
||||
["<!DOCTYPE html>", "<html>", "<head>", "<title>Available RSS Feeds</title>", "</head>", "<body>", "<h1>Available RSS Feeds</h1>", "<ul>"] ++ feeds ++ ["</ul>", "</body>", "</html>"]
|
||||
|
||||
timestamp :: UTCTime -> Text
|
||||
timestamp = into . Time.formatTime Time.defaultTimeLocale "%Y-%m-%d %H:%M"
|
||||
|
||||
extractFolder :: TagSoup.Tag Text -> Maybe (Text, Text)
|
||||
extractFolder = \tag ->
|
||||
let
|
||||
title = TagSoup.fromAttrib "title" tag
|
||||
href = TagSoup.fromAttrib "href" tag
|
||||
in
|
||||
if Text.null title || Text.null href
|
||||
then Nothing
|
||||
else Just (href, title)
|
|
@ -20,6 +20,7 @@
|
|||
text,
|
||||
time,
|
||||
witch,
|
||||
wreq,
|
||||
}:
|
||||
mkDerivation {
|
||||
pname = "rssfeeds";
|
||||
|
@ -47,6 +48,7 @@ mkDerivation {
|
|||
text
|
||||
time
|
||||
witch
|
||||
wreq
|
||||
];
|
||||
license = "unknown";
|
||||
}
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
cradle:
|
||||
cabal:
|
||||
cabal: {}
|
||||
|
|
|
@ -74,6 +74,25 @@ executable mail2rss
|
|||
|
||||
default-language: GHC2021
|
||||
|
||||
executable folders2rss
|
||||
import: common-options
|
||||
main-is: Folders.hs
|
||||
build-depends:
|
||||
, base
|
||||
, containers
|
||||
, extra
|
||||
, feed >=1.3.0.0
|
||||
, filepattern
|
||||
, relude
|
||||
, string-interpolate
|
||||
, tagsoup
|
||||
, text
|
||||
, time
|
||||
, witch
|
||||
, wreq
|
||||
|
||||
default-language: GHC2021
|
||||
|
||||
executable mastodon2rss
|
||||
import: common-options
|
||||
main-is: Mastodon.hs
|
||||
|
|
Loading…
Reference in a new issue