ft-udvalg: Download udvalgsmembers from folketing.dk as ODS.

2024-07-15 16:03:54 +02:00 · 2024-07-15 16:03:54 +02:00 · 3be8b564cb
parent fb1f3af984
commit 3be8b564cb
3 changed files with 218 additions and 14 deletions
--- a/28
+++ b/28
@ -1,23 +1,23 @@
-CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac		\
+CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac	\
 	duplicate-packets em emoticons encdir fanspeed field		\
-	find-first-fail find-optimal forever fxkill G gitnext gitundo	\
-	goodpasswd histogram Loffice mtrr mirrorpdf neno not off	\
-	pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean	\
-	rina rn rrm seekmaniac shython sound-reload splitvideo stdout	\
-	swapout T teetime timestamp tracefile transpose upsidedown	\
-	vid w4it-for-port-open whitehash wifi-reload wssh		\
-	youtube-lbry ytv yyyymmdd
+	find-first-fail find-optimal forever ft-udvalg fxkill G		\
+	gitnext gitundo goodpasswd histogram Loffice mtrr mirrorpdf	\
+	neno not off pdfman pidcmd pidtree plotpipe puniq ramusage	\
+	rand rclean rina rn rrm seekmaniac shython sound-reload		\
+	splitvideo stdout swapout T teetime timestamp tracefile		\
+	transpose upsidedown vid w4it-for-port-open whitehash		\
+	wifi-reload wssh youtube-lbry ytv yyyymmdd

 all: 2search/2grep.1 2search/2search.1 blink/blink.1			\
 	burncpu/burncpu.1 bwlimit/bwlimit.1 clipboard/clipboard.1	\
 	drac/drac.1 encdir/encdir.1 fanspeed/fanspeed.1 field/field.1	\
 	find-first-fail/find-first-fail.1 find-optimal/find-optimal.1	\
-	G/G.1 gitnext/gitnext.1 gitundo/gitundo.1			\
-	goodpasswd/goodpasswd.1 histogram/histogram.1			\
-	mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1	\
-	pidcmd/pidcmd.1 pidtree/pidtree.1 plotpipe/plotpipe.1		\
-	puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1		\
-	seekmaniac/seekmaniac.1 shython/shython.1			\
+	ft-udvalg/ft-udvalg.1 G/G.1 gitnext/gitnext.1			\
+	gitundo/gitundo.1 goodpasswd/goodpasswd.1			\
+	histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1		\
+	off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1	\
+	plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1	\
+	rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1	\
 	sound-reload/sound-reload.1 splitvideo/splitvideo.1		\
 	stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1		\
 	tracefile/tracefile.1 transpose/transpose.1 T/T.1		\
--- a/4
+++ b/4
@ -24,8 +24,12 @@ find-first-fail - find the lowest argument that makes a command fail.

 forever - run the same command or list of commands every second.

+ft-udvalg - Download udvalgsmembers from folketing.dk as ODS.
+
 G - shorthand for multi level grep.

+gitedit - edit last 10 commits.
+
 gitnext - checkout next revision. Opposite of 'checkout HEAD^'.

 gitundo - undo commit.
--- a/ft-udvalg/ft-udvalg
+++ b/ft-udvalg/ft-udvalg
@ -0,0 +1,200 @@
+#!/usr/bin/python3
+
+"""
+=pod
+
+=encoding UTF-8
+
+=head1 NAME
+
+ft-udvalg - Download udvalgsmembers from folketing.dk as ODS
+
+
+=head1 SYNOPSIS
+
+B<ft-udvalg>
+
+
+=head1 DESCRIPTION
+
+B<ft-udvalg> will walk through REU, BEU, BUU, EPI, ERU, EUU, FIU, FOU,
+FÆU, GRU, BOU, IFU, KIU, KEF, KUU, LIU, MOF, SAU, SOU, SUU, TRU, UFO,
+URU, UUI, ULØ, and UVP, select all the members, add their email
+addresses, and put them in an ODS file that is easy to use with Auto
+Filter.
+
+ft.dk requires your IP address to be from Denmark. Otherwise you will
+be blocked by CloudFlare.
+
+=head1 EXAMPLE
+
+Generate ft-udvalgsmedlemmer.ods:
+
+  ft-udvalg
+
+
+=head1 AUTHOR
+
+Copyright (C) 2024 Ole Tange,
+http://ole.tange.dk and Free Software Foundation, Inc.
+
+
+=head1 LICENSE
+
+Copyright (C) 2012 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+at your option any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+=head1 DEPENDENCIES
+
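+B<ft-udvalg> uses B<python3> and the Python modules B<requests>,
+B<requests_cache>, B<bs4> (BeautifulSoup), B<pandas>, and B<PyPDF2>.
+Writing the ODS file through B<pandas> additionally requires an ODS
+engine such as B<odfpy>.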
+
+
+=head1 SEE ALSO
+
+B<python3>
+
+=cut
+"""
+
+import os
+import logging
+import requests
+import requests_cache
+from bs4 import BeautifulSoup
+import pandas as pd
+import PyPDF2
+import re
+
+# Enable logging for requests-cache. DEBUG level also shows the
+# logging.debug() line each fetch function emits, which reports whether
+# the response came from the cache.
+logging.basicConfig(level=logging.DEBUG)
+
+# Initialize the cache so the requests.get() calls below can be answered
+# from a local store instead of hitting ft.dk on every run.
+# NOTE(review): despite its name, cache_dir is passed as cache_name;
+# presumably the sqlite backend derives its database file from it -
+# confirm against the requests-cache documentation.
+cache_dir = os.path.expanduser("~/.cache/ft-udvalg")
+requests_cache.install_cache(cache_name=cache_dir, backend='sqlite', expire_after=86400)  # Cache expires after 1 day
+
+# Site root; links scraped from the pages are prefixed with this.
+base_url = "https://www.ft.dk"
+
+# Committee codes. Each code is inserted as-is into the URL of the
+# committee's member overview and upper-cased to name its column in the
+# resulting spreadsheet.
+udvalg = [
+    "reu", "beu", "buu", "epi", "eru", "euu", "fiu", "fou", "fæu",
+    "gru", "bou", "ifu", "kiu", "kef", "kuu", "liu", "mof", "sau",
+    "sou", "suu", "tru", "ufo", "uru", "uui", "ulø", "uvp"
+]
+
+# Step 1: Extract member links from the provided URL
+def extract_members(udvalg_url):
+    """Return the biography-page links of all members listed for a committee.
+
+    Args:
+        udvalg_url: URL of a committee's member-overview page.
+
+    Returns:
+        A list of ``{"biopage": <absolute URL>}`` dicts, one for every table
+        cell marked ``data-title="Navn"`` that contains a link, in page
+        order.
+    """
+    from urllib.parse import urljoin
+
+    response = requests.get(udvalg_url)
+    logging.debug(f"Fetching {udvalg_url}, from cache: {response.from_cache}")
+    soup = BeautifulSoup(response.text, 'html.parser')
+    members = []
+
+    for td_tag in soup.find_all('td', {'data-title': 'Navn'}):
+        a_tag = td_tag.find('a', href=True)
+        if a_tag:
+            # urljoin keeps absolute links untouched and resolves relative
+            # ones against the site root. The earlier prefix test only
+            # recognised absolute links under /medlemmer/mf/ and glued
+            # base_url in front of every other absolute URL.
+            members.append({"biopage": urljoin(base_url, a_tag['href'])})
+    return members
+
+# Step 2: Extract the name and PDF URL for each member
+def extract_pdf_url(member_url):
+    """Fetch a member's biography page and return name, party and CV link.
+
+    Args:
+        member_url: Absolute URL of the member's biography page.
+
+    Returns:
+        A dict with the keys ``'Navn'``, ``'Parti'`` and ``'CV'``. ``'CV'``
+        is the absolute URL of the first download button whose text
+        contains "CV", or None when the page has no such button.
+
+    Raises:
+        ValueError: If the page has no biography title, or the title is not
+            of the form ``Name (Party)``.
+    """
+    from urllib.parse import urljoin
+
+    response = requests.get(member_url)
+    logging.debug(f"Fetching {member_url}, from cache: {response.from_cache}")
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    title_tag = soup.find('h1', class_='biography-page-title')
+    if title_tag is None:
+        # Without this check a page lacking the title (e.g. an error or
+        # block page) fails with an AttributeError on None that does not
+        # say which page was involved.
+        raise ValueError(f"No biography title found on {member_url}")
+
+    title = title_tag.text.strip()
+    match = re.match(r'^(.*)\s\((.*)\)$', title)
+    if not match:
+        raise ValueError(f"Text format does not match 'Name (Party)': {title!r} ({member_url})")
+    name, party = match.groups()
+
+    pdf_url = next((button['href'] for button in soup.select('a.download__container__docBtns__btn') if "CV" in button.get_text()), None)
+
+    if pdf_url:
+        # Resolve relative links against the site root; absolute links,
+        # including ones on another host, are kept as they are.
+        pdf_url = urljoin(base_url, pdf_url)
+
+    return {'Navn': name, 'Parti': party, 'CV': pdf_url}
+
+# Step 3: Extract email from the PDF
+def extract_email_from_pdf(member):
+    """Return the e-mail address found in a member's CV PDF, if any.
+
+    Args:
+        member: Member record; ``member["CV"]`` is the URL of the CV PDF
+            (or a false value when there is none) and ``member["Navn"]``
+            is used in error messages.
+
+    Returns:
+        The first address that follows an "E-mail:" label on any page, or
+        None when there is no CV, no address is found, or the PDF cannot
+        be downloaded or parsed (the failure is logged, not raised).
+    """
+    import io
+
+    pdf_url = member["CV"]
+    if not pdf_url:
+        return None
+
+    try:
+        response = requests.get(pdf_url)
+        logging.debug(f"Fetching {pdf_url}, from cache: {response.from_cache}")
+
+        # Parse straight from memory. This avoids a fixed 'temp.pdf' in the
+        # working directory and the cleanup step that referenced the path
+        # before it was assigned when the download itself failed.
+        reader = PyPDF2.PdfReader(io.BytesIO(response.content))
+
+        for page in reader.pages:
+            # Replace \xad with - so a label extracted as "E\xadmail" and
+            # hyphenated addresses still match.
+            text = page.extract_text().replace('\xad', '-')
+            email_match = re.search(r'E[^a-z]*mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
+            if email_match:
+                return email_match.group(1)
+
+        return None
+
+    except PyPDF2.errors.PdfReadError as e:
+        logging.error(f"Failed to read PDF for member {member['Navn']} with URL {pdf_url}: {e}")
+        return None
+    except Exception as e:
+        # Best effort: one unreadable or unreachable CV must not abort the
+        # whole run.
+        logging.error(f"An error occurred while processing member {member['Navn']} with URL {pdf_url}: {e}")
+        return None
+
+# Fetch the member overview of every committee, keyed by committee code
+udv_members = {
+    udv: extract_members(f"{base_url}/da/udvalg/udvalgene/{udv}/medlemsoversigt")
+    for udv in udvalg
+}
+
+# Merge into one record per biography page; an "X" in a committee's
+# upper-case column marks membership
+members = {}
+for udv, member_list in udv_members.items():
+    column = udv.upper()
+    for entry in member_list:
+        biopage = entry["biopage"]
+        members.setdefault(biopage, {"biopage": biopage})[column] = "X"
+
+# Add name, party, CV link and e-mail address to every unique member
+for member in members.values():
+    member.update(extract_pdf_url(member["biopage"]))
+    if member["CV"]:
+        member['Email'] = extract_email_from_pdf(member)
+    else:
+        member['Email'] = None
+
+# Fixed leading columns followed by one column per committee, sorted by code
+columns_order = ['Navn', 'Parti', 'Email', 'biopage', 'CV']
+columns_order += [udv.upper() for udv in sorted(udvalg)]
+
+# Step 4: Save the extracted data to an ODS file, columns in the order above
+df = pd.DataFrame(list(members.values())).reindex(columns=columns_order)
+df.to_excel('ft-udvalgsmedlemmer.ods', index=False)
+
+print("Data has been successfully saved to ft-udvalgsmedlemmer.ods")