From 3be8b564cbbeeb3a2e3b215be192d3cd111df011 Mon Sep 17 00:00:00 2001
From: Ole Tange <ole@tange.dk>
Date: Mon, 15 Jul 2024 16:03:54 +0200
Subject: [PATCH] ft-udvalg: Download committee members from folketing.dk as
 ODS.

---
 Makefile            |  28 +++---
 README              |   4 +
 ft-udvalg/ft-udvalg | 200 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 218 insertions(+), 14 deletions(-)
 create mode 100755 ft-udvalg/ft-udvalg

diff --git a/Makefile b/Makefile
index 44fbce3..dfe2388 100644
--- a/Makefile
+++ b/Makefile
@@ -1,23 +1,23 @@
-CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac \
+CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac \
 	duplicate-packets em emoticons encdir fanspeed field \
-	find-first-fail find-optimal forever fxkill G gitnext gitundo \
-	goodpasswd histogram Loffice mtrr mirrorpdf neno not off \
-	pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \
-	rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
-	swapout T teetime timestamp tracefile transpose upsidedown \
-	vid w4it-for-port-open whitehash wifi-reload wssh \
-	youtube-lbry ytv yyyymmdd
+	find-first-fail find-optimal forever ft-udvalg fxkill G \
+	gitnext gitundo goodpasswd histogram Loffice mtrr mirrorpdf \
+	neno not off pdfman pidcmd pidtree plotpipe puniq ramusage \
+	rand rclean rina rn rrm seekmaniac shython sound-reload \
+	splitvideo stdout swapout T teetime timestamp tracefile \
+	transpose upsidedown vid w4it-for-port-open whitehash \
+	wifi-reload wssh youtube-lbry ytv yyyymmdd
 
 all:	2search/2grep.1 2search/2search.1 blink/blink.1 \
 	burncpu/burncpu.1 bwlimit/bwlimit.1 clipboard/clipboard.1 \
 	drac/drac.1 encdir/encdir.1 fanspeed/fanspeed.1 field/field.1 \
 	find-first-fail/find-first-fail.1 find-optimal/find-optimal.1 \
-	G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 \
-	goodpasswd/goodpasswd.1 histogram/histogram.1 \
-	mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 \
-	pidcmd/pidcmd.1 pidtree/pidtree.1 plotpipe/plotpipe.1 \
-	puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \
-	seekmaniac/seekmaniac.1 shython/shython.1 \
+	ft-udvalg/ft-udvalg.1 G/G.1 gitnext/gitnext.1 \
+	gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
+	histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
+	off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1 \
+	plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1 \
+	rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1 \
 	sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
 	stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \
 	tracefile/tracefile.1 transpose/transpose.1 T/T.1 \
diff --git a/README b/README
index 60f457c..216dbe5 100644
--- a/README
+++ b/README
@@ -24,8 +24,12 @@ find-first-fail - find the lowest argument that makes a command fail.
 
 forever - run the same command or list of commands every second.
 
+ft-udvalg - Download committee members from folketing.dk as ODS.
+
 G - shorthand for multi level grep.
 
+gitedit - edit last 10 commits.
+
 gitnext - checkout next revision. Opposite of 'checkout HEAD^'.
 
 gitundo - undo commit.
diff --git a/ft-udvalg/ft-udvalg b/ft-udvalg/ft-udvalg
new file mode 100755
index 0000000..6d3becf
--- /dev/null
+++ b/ft-udvalg/ft-udvalg
@@ -0,0 +1,200 @@
+#!/usr/bin/python3
+
+"""
+=pod
+
+=encoding UTF-8
+
+=head1 NAME
+
+ft-udvalg - Download committee members from folketing.dk as ODS
+
+
+=head1 SYNOPSIS
+
+B<ft-udvalg>
+
+
+=head1 DESCRIPTION
+
+B<ft-udvalg> will walk through REU, BEU, BUU, EPI, ERU, EUU, FIU, FOU,
+FÆU, GRU, BOU, IFU, KIU, KEF, KUU, LIU, MOF, SAU, SOU, SUU, TRU, UFO,
+URU, UUI, ULØ, and UVP, select all the members, add their email
+addresses, and put them in an ODS file that is easy to use with
+AutoFilter.
+
+ft.dk requires your IP address to be in Denmark; otherwise you will
+be blocked by Cloudflare.
+
+
+=head1 EXAMPLE
+
+Generate ft-udvalgsmedlemmer.ods:
+
+  ft-udvalg
+
+
+=head1 AUTHOR
+
+Copyright (C) 2024 Ole Tange,
+http://ole.tange.dk and Free Software Foundation, Inc.
+
+
+=head1 LICENSE
+
+Copyright (C) 2012 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+
+=head1 DEPENDENCIES
+
+B<ft-udvalg> uses B<python3> and a number of Python modules: requests,
+requests_cache, bs4, pandas, PyPDF2, and odfpy (for writing ODS).
+
+
+=head1 SEE ALSO
+
+B<https://www.ft.dk/>
+
+=cut
+"""
+
+import os
+import logging
+import requests
+import requests_cache
+from bs4 import BeautifulSoup
+import pandas as pd
+import PyPDF2
+import re
+
+# Enable logging for requests-cache
+logging.basicConfig(level=logging.DEBUG)
+
+# Initialize the cache (entries expire after 1 day)
+cache_dir = os.path.expanduser("~/.cache/ft-udvalg")
+requests_cache.install_cache(cache_name=cache_dir, backend='sqlite',
+                             expire_after=86400)
+
+base_url = "https://www.ft.dk"
+
+udvalg = [
+    "reu", "beu", "buu", "epi", "eru", "euu", "fiu", "fou", "fæu",
+    "gru", "bou", "ifu", "kiu", "kef", "kuu", "liu", "mof", "sau",
+    "sou", "suu", "tru", "ufo", "uru", "uui", "ulø", "uvp"
+]
+
+# Step 1: Extract member links from the committee's member overview page
+def extract_members(udvalg_url):
+    response = requests.get(udvalg_url)
+    logging.debug(f"Fetching {udvalg_url}, from cache: {response.from_cache}")
+    soup = BeautifulSoup(response.text, 'html.parser')
+    members = []
+
+    for td_tag in soup.find_all('td', {'data-title': 'Navn'}):
+        a_tag = td_tag.find('a', href=True)
+        if a_tag:
+            href = a_tag['href']
+            # Member links are usually relative; make them absolute
+            url = href if href.startswith('http') else base_url + href
+            members.append({"biopage": url})
+    return members
+
+# Step 2: Extract the name, party, and CV PDF URL for each member
+def extract_pdf_url(member_url):
+    response = requests.get(member_url)
+    logging.debug(f"Fetching {member_url}, from cache: {response.from_cache}")
+    soup = BeautifulSoup(response.text, 'html.parser')
+    name = soup.find('h1', class_='biography-page-title').text.strip()
+    match = re.match(r'^(.*)\s\((.*)\)$', name)
+    if match:
+        name, party = match.groups()
+    else:
+        raise ValueError("Text format does not match 'Name (Party)'")
+    pdf_url = next((button['href']
+                    for button in soup.select('a.download__container__docBtns__btn')
+                    if "CV" in button.get_text()),
+                   None)
+
+    if pdf_url and not pdf_url.startswith(base_url):
+        pdf_url = base_url + pdf_url
+
+    return {'Navn': name, 'Parti': party, 'CV': pdf_url}
+
+# Step 3: Extract email from the PDF
+def extract_email_from_pdf(member):
+    pdf_url = member["CV"]
+    if not pdf_url:
+        return None
+
+    pdf_path = 'temp.pdf'
+    try:
+        response = requests.get(pdf_url)
+        logging.debug(f"Fetching {pdf_url}, from cache: {response.from_cache}")
+
+        with open(pdf_path, 'wb') as file:
+            file.write(response.content)
+
+        reader = PyPDF2.PdfReader(pdf_path)
+        email = None
+
+        for page in reader.pages:
+            # Replace soft hyphens (\xad) with - so wrapped text still matches
+            text = page.extract_text().replace('\xad', '-')
+            email_match = re.search(r'E[^a-z]*mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
+            if email_match:
+                email = email_match.group(1)
+                break
+
+        return email
+
+    except PyPDF2.errors.PdfReadError as e:
+        logging.error(f"Failed to read PDF for member {member['Navn']} with URL {pdf_url}: {e}")
+        return None
+    except Exception as e:
+        logging.error(f"An error occurred while processing member {member['Navn']} with URL {pdf_url}: {e}")
+        return None
+    finally:
+        if os.path.exists(pdf_path):
+            os.remove(pdf_path)
+
+# Process members from each committee
+udv_members = {}
+for udv in udvalg:
+    udvalg_url = f"{base_url}/da/udvalg/udvalgene/{udv}/medlemsoversigt"
+    udv_members[udv] = extract_members(udvalg_url)
+
+# Consolidate members
+members = {}
+for udv, member_list in udv_members.items():
+    for member in member_list:
+        if member["biopage"] not in members:
+            members[member["biopage"]] = {"biopage": member["biopage"]}
+        members[member["biopage"]][udv.upper()] = "X"  # Mark membership
+
+# Extract additional data for each unique member
+for member in members.values():
+    pdf_data = extract_pdf_url(member["biopage"])
+    member.update(pdf_data)
+    member['Email'] = extract_email_from_pdf(member) if member["CV"] else None
+
+# Convert the members dictionary to a list of dictionaries
+members_list = list(members.values())
+
+# Define the column order
+sorted_udvalg = sorted(udvalg)
+columns_order = (['Navn', 'Parti', 'Email', 'biopage', 'CV']
+                 + [udv.upper() for udv in sorted_udvalg])
+
+# Step 4: Save the extracted data to an ODS file
+df = pd.DataFrame(members_list)
+
+# Reorder columns
+df = df.reindex(columns=columns_order)
+
+# pandas writes ODS through its odf engine, which requires odfpy
+df.to_excel('ft-udvalgsmedlemmer.ods', index=False)
+
+print("Data has been successfully saved to ft-udvalgsmedlemmer.ods")
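
The whole scraper leans on requests_cache.install_cache(), which
patches requests so that every requests.get() is transparently served
from an SQLite cache when possible, and every response carries the
from_cache flag the script logs on each fetch. A minimal sketch of
that behaviour, using a hypothetical cache name instead of the
script's ~/.cache/ft-udvalg (and note that fetching ft.dk only works
from a Danish IP address):

  import requests
  import requests_cache

  # After install_cache(), plain requests.get() goes through a cached
  # session: the first fetch hits the network and is stored in
  # demo-cache.sqlite, the second is served from the cache.
  requests_cache.install_cache('demo-cache', backend='sqlite',
                               expire_after=86400)

  r1 = requests.get('https://www.ft.dk')
  r2 = requests.get('https://www.ft.dk')
  print(r1.from_cache, r2.from_cache)   # expected: False True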
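
The E[^a-z]*mail pattern in step 3 is deliberately loose about what
sits between "E" and "mail", because PyPDF2 often extracts "E-mail"
with a soft hyphen (U+00AD) or other separators, and the script
additionally rewrites soft hyphens to "-" before matching. A small
self-contained check (the sample addresses are made up):

  import re

  pattern = re.compile(
      r'E[^a-z]*mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})')

  samples = [
      'E-mail: medlem@ft.dk',        # plain hyphen
      'E\xadmail: medlem@ft.dk',     # soft hyphen, as PyPDF2 often emits
      'Email: medlem@ft.dk',         # no separator at all
  ]
  for text in samples:
      m = pattern.search(text.replace('\xad', '-'))
      print(m.group(1) if m else 'no match')   # all three match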
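
Since pandas writes the ODS file via odfpy, the result can also be
loaded back and filtered without LibreOffice. A sketch, assuming a
successful run has produced ft-udvalgsmedlemmer.ods (FIU is one of
the committee columns the script creates, marked with "X"):

  import pandas as pd

  # Read the sheet back; the odf engine requires odfpy.
  df = pd.read_excel('ft-udvalgsmedlemmer.ods', engine='odf')

  # Members of FIU with a known email address.
  fiu = df[(df['FIU'] == 'X') & df['Email'].notna()]
  print(fiu[['Navn', 'Parti', 'Email']].to_string(index=False))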