#!/usr/bin/lua

-- falseknees-atom.lua
--
-- A simple scraper of falseknees.com/archive that prints an Atom feed on
-- stdout, for consumption by newsboat or similar newsreaders that can get a
-- feed from some external command.
--
-- Initial commit 3e2254d by Laria Carolin Chabowski, 2020-03-11.
--
-- NOTE(review): the copy of this script that was reviewed had all markup
-- stripped out of its string literals (HTML patterns, XML templates and
-- entity replacements). Everything tagged NOTE(review) below was rebuilt from
-- the surrounding code and must be confirmed against the upstream file and
-- the live site before relying on it.

local string = require "string"
local http = require "socket.http"

local SITE = "https://falseknees.com/"

-- A UUID URN with a UUID generated by `uuid -r`.
local FEED_ID = "urn:uuid:7e86b012-c226-41b3-945e-bab3341c65e9"

-- One archive link: href, month name, day (ordinal suffix skipped), year and
-- title, in that order.
-- NOTE(review): the anchor markup around the date/title part is a
-- reconstruction; verify against the real archive.html.
local ARCHIVE_ENTRY_PATTERN =
	'<a href="([^"]+)"[^>]*>(%w+) (%d+)%w+, (%d+)%s*%-%s*([^<>]-)</a>'

-- Site-relative permanent URL of the comic currently shown on index.html.
-- NOTE(review): best guess only; the original pattern was lost. When it does
-- not match, the current comic is simply left out of the feed.
local INDEX_PERMALINK_PATTERN =
	'<meta property="og:url" content="https?://falseknees%.com/([^"]+)"'

-- Fetch `url` and return the response body; raises unless the status is 200.
-- On transport failures the second result of http.request is an error
-- message rather than a number, hence the tostring().
local function must_get_page(url)
	local data, status = http.request(url)
	if status ~= 200 then
		error("Failed getting '" .. url .. "': Got status " .. tostring(status) .. "\n")
	end
	return data
end

local XML_ESCAPES = {
	["&"] = "&amp;",
	["<"] = "&lt;",
	[">"] = "&gt;",
	['"'] = "&quot;",
	["'"] = "&apos;",
}

-- Escape `text` for use in XML character data or attribute values.
-- A single pass over the input means "&" is never escaped twice.
local function xmlesc(text)
	-- Extra parentheses drop gsub's second result (the match count).
	return (string.gsub(text, "[&<>\"']", XML_ESCAPES))
end

local HTML_ENTITIES = {
	amp = "&",
	lt = "<",
	gt = ">",
	quot = '"',
	apos = "'",
}

-- Replacement for one entity name (the text between "&" and ";").
-- Returns nil for anything unknown, which makes gsub keep the original text.
local function decode_entity(name)
	local named = HTML_ENTITIES[name]
	if named then
		return named
	end

	local digits, base = string.match(name, "^#[xX](%x+)$"), 16
	if not digits then
		digits, base = string.match(name, "^#(%d+)$"), 10
	end

	local code = digits and tonumber(digits, base)
	if not code or code < 1 then
		return nil
	end
	if code < 128 then
		return string.char(code)
	end
	-- The utf8 library only exists in Lua 5.3+; without it non-ASCII
	-- references are left encoded.
	if utf8 and code <= 0x10FFFF then
		return utf8.char(code)
	end
	return nil
end

-- Decode the basic named HTML entities and numeric character references in
-- `s`. The entity name is restricted to "#" plus alphanumerics so that a bare
-- "&" in running text cannot swallow a real entity that follows it.
local function unesc_html(s)
	return (string.gsub(s, "&(#?%w+);", decode_entity))
end

local months = {
	January = 1,
	February = 2,
	March = 3,
	April = 4,
	May = 5,
	June = 6,
	July = 7,
	August = 8,
	September = 9,
	October = 10,
	November = 11,
	December = 12,
}

-- Get the permanent URL of the current comic (index.html).
-- Returns a site-relative URL, or nil when it cannot be found so that the
-- caller can skip the entry.
local function resolve_index()
	local indexsrc = must_get_page(SITE .. "index.html")
	local url = string.match(indexsrc, INDEX_PERMALINK_PATTERN)
	if not url then
		return nil
	end
	return unesc_html(url)
end

-- Extract all comics from the archive page source.
-- Returns the list of entries ({url, title, date}) in page order and the
-- newest date seen (nil when nothing matched).
local function parse_archive(pagesrc)
	local entries = {}
	local maxdate

	for url, monthname, day, year, title in string.gmatch(pagesrc, ARCHIVE_ENTRY_PATTERN) do
		url = unesc_html(url)
		if url == "index.html" then
			url = resolve_index()
		end

		-- Entries whose URL could not be resolved are skipped.
		if url then
			local month = assert(months[monthname], "unknown month")

			-- We don't know the time, let's assume midnight at UTC+0.
			local date = string.format("%04d-%02d-%02dT00:00:00Z",
				tonumber(year, 10), month, tonumber(day, 10))

			-- Fixed-width ISO 8601 strings sort chronologically.
			if not maxdate or date > maxdate then
				maxdate = date
			end

			entries[#entries + 1] = {
				url = SITE .. url,
				title = unesc_html(title),
				date = date,
			}
		end
	end

	return entries, maxdate
end

-- Print the Atom document for `entries`; `maxdate` becomes the feed's
-- <updated> timestamp.
-- NOTE(review): element names follow RFC 4287; the exact original templates
-- were lost and should be compared with upstream.
local function print_feed(entries, maxdate)
	print(string.format([[
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
	<title>False Knees</title>
	<link href="https://falseknees.com/" />
	<id>%s</id>
	<author><name>Joshua Barkman</name></author>
	<updated>%s</updated>]], xmlesc(FEED_ID), xmlesc(maxdate)))

	for _, entry in ipairs(entries) do
		print(string.format([[
	<entry>
		<title>%s</title>
		<link href="%s" />
		<id>%s</id>
		<updated>%s</updated>
	</entry>]],
			xmlesc(entry.title),
			xmlesc(entry.url),
			xmlesc(FEED_ID .. "#" .. entry.url),
			xmlesc(entry.date)))
	end

	print("</feed>")
end

local entries, maxdate = parse_archive(must_get_page(SITE .. "archive.html"))

if not maxdate then
	error("No entry successfully parsed")
end

print_feed(entries, maxdate)