ft-udvalg: Download udvalgsmembers from folketing.dk as ODS.

This commit is contained in:
Ole Tange 2024-07-15 16:03:54 +02:00
parent fb1f3af984
commit 3be8b564cb
3 changed files with 218 additions and 14 deletions

View file

@ -1,23 +1,23 @@
CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac \
CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac \
duplicate-packets em emoticons encdir fanspeed field \
find-first-fail find-optimal forever fxkill G gitnext gitundo \
goodpasswd histogram Loffice mtrr mirrorpdf neno not off \
pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \
rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
swapout T teetime timestamp tracefile transpose upsidedown \
vid w4it-for-port-open whitehash wifi-reload wssh \
youtube-lbry ytv yyyymmdd
find-first-fail find-optimal forever ft-udvalg fxkill G \
gitnext gitundo goodpasswd histogram Loffice mtrr mirrorpdf \
neno not off pdfman pidcmd pidtree plotpipe puniq ramusage \
rand rclean rina rn rrm seekmaniac shython sound-reload \
splitvideo stdout swapout T teetime timestamp tracefile \
transpose upsidedown vid w4it-for-port-open whitehash \
wifi-reload wssh youtube-lbry ytv yyyymmdd
all: 2search/2grep.1 2search/2search.1 blink/blink.1 \
burncpu/burncpu.1 bwlimit/bwlimit.1 clipboard/clipboard.1 \
drac/drac.1 encdir/encdir.1 fanspeed/fanspeed.1 field/field.1 \
find-first-fail/find-first-fail.1 find-optimal/find-optimal.1 \
G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 \
goodpasswd/goodpasswd.1 histogram/histogram.1 \
mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 \
pidcmd/pidcmd.1 pidtree/pidtree.1 plotpipe/plotpipe.1 \
puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \
seekmaniac/seekmaniac.1 shython/shython.1 \
ft-udvalg/ft-udvalg.1 G/G.1 gitnext/gitnext.1 \
gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1 \
plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1 \
rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1 \
sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \
tracefile/tracefile.1 transpose/transpose.1 T/T.1 \

4
README
View file

@ -24,8 +24,12 @@ find-first-fail - find the lowest argument that makes a command fail.
forever - run the same command or list of commands every second.
ft-udvalg - Download udvalgsmembers from folketing.dk as ODS.
G - shorthand for multi level grep.
gitedit - edit last 10 commits.
gitnext - checkout next revision. Opposite of 'checkout HEAD^'.
gitundo - undo commit.

200
ft-udvalg/ft-udvalg Executable file
View file

@ -0,0 +1,200 @@
#!/usr/bin/python3
"""
=pod
=encoding UTF-8
=head1 NAME
ft-udvalg - Download udvalgsmembers from folketing.dk as ODS
=head1 SYNOPSIS
B<ft-udvalg>
=head1 DESCRIPTION
B<ft-udvalg> will walk through REU, BEU, BUU, EPI, ERU, EUU, FIU, FOU,
FÆU, GRU, BOU, IFU, KIU, KEF, KUU, LIU, MOF, SAU, SOU, SUU, TRU, UFO,
URU, UUI, ULØ, and UVP, select all the members, add their email
addresses, and put it in an ODS file that is easy to use with Auto
Filter.
ft.dk requires your IP address to be from Denmark. Otherwise you will
be blocked by CloudFlare.
=head1 EXAMPLE
Generate ft-udvalgsmedlemmer.ods:
ft-udvalg
=head1 AUTHOR
Copyright (C) 2024 Ole Tange,
http://ole.tange.dk and Free Software Foundation, Inc.
=head1 LICENSE
Copyright (C) 2012 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
at your option any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head1 DEPENDENCIES
B<ft-udvalg> uses B<python3> and the Python modules B<requests>,
B<requests_cache>, B<bs4> (BeautifulSoup), B<pandas>, and B<PyPDF2>.
=head1 SEE ALSO
B<python3>
=cut
"""
import io
import logging
import os
import re
from urllib.parse import urljoin

import pandas as pd
import PyPDF2
import requests
import requests_cache
from bs4 import BeautifulSoup
# Log at DEBUG level so the per-request cache messages below are shown
logging.basicConfig(level=logging.DEBUG)
# Install the requests cache (SQLite backend); entries expire after 1 day
cache_dir = os.path.expanduser("~/.cache/ft-udvalg")
requests_cache.install_cache(
    cache_name=cache_dir,
    backend='sqlite',
    expire_after=24 * 60 * 60,
)
base_url = "https://www.ft.dk"
# Short names of the committees to walk through, as used in the ft.dk URLs
udvalg = (
    "reu beu buu epi eru euu fiu fou fæu "
    "gru bou ifu kiu kef kuu liu mof sau "
    "sou suu tru ufo uru uui ulø uvp"
).split()
# Step 1: Extract member links from the provided URL
def extract_members(udvalg_url):
    """Return the biography-page links of the members listed on a committee page.

    Args:
        udvalg_url: URL of a committee's member overview page.

    Returns:
        A list of ``{"biopage": <absolute URL>}`` dicts, one for every
        ``<td data-title="Navn">`` cell that contains a link, in page order.
        The list is empty when the page contains no such cells.
    """
    response = requests.get(udvalg_url)
    logging.debug(f"Fetching {udvalg_url}, from cache: {response.from_cache}")
    if not response.ok:
        # An error page has no member table, so the result would silently be
        # empty; make the cause visible without aborting the whole run.
        logging.warning(f"Fetching {udvalg_url} returned HTTP status {response.status_code}")
    soup = BeautifulSoup(response.text, 'html.parser')
    members = []
    for td_tag in soup.find_all('td', {'data-title': 'Navn'}):
        a_tag = td_tag.find('a', href=True)
        if a_tag:
            # urljoin leaves absolute links untouched and resolves relative
            # ones against the site root, so base_url is never prepended to a
            # link that is already absolute.
            members.append({"biopage": urljoin(base_url, a_tag['href'])})
    return members
# Step 2: Extract the name and PDF URL for each member
def extract_pdf_url(member_url):
    """Extract name, party and CV link from a member's biography page.

    Args:
        member_url: URL of the member's biography page.

    Returns:
        A dict with the keys ``'Navn'`` (name), ``'Parti'`` (party) and
        ``'CV'`` (absolute URL of the CV download, or ``None`` when the page
        has no download button whose text contains "CV").

    Raises:
        ValueError: If the page has no biography title, or the title is not
            of the form ``Name (Party)``.
    """
    response = requests.get(member_url)
    logging.debug(f"Fetching {member_url}, from cache: {response.from_cache}")
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('h1', class_='biography-page-title')
    if title_tag is None:
        # Without this check the failure would be an AttributeError on None
        # that does not say which page is at fault.
        raise ValueError(f"No biography title found on {member_url}")
    title = title_tag.text.strip()

    match = re.match(r'^(.*)\s\((.*)\)$', title)
    if not match:
        raise ValueError(
            f"Text format does not match 'Name (Party)': {title!r} on {member_url}"
        )
    name, party = match.groups()

    # First download button mentioning "CV" that actually carries a link.
    pdf_href = next(
        (
            button.get('href')
            for button in soup.select('a.download__container__docBtns__btn')
            if "CV" in button.get_text() and button.get('href')
        ),
        None,
    )
    # urljoin keeps absolute links as they are and resolves relative ones
    # against the site root.
    pdf_url = urljoin(base_url, pdf_href) if pdf_href else None
    return {'Navn': name, 'Parti': party, 'CV': pdf_url}
# Step 3: Extract email from the PDF
def extract_email_from_pdf(member):
    """Download a member's CV PDF and return the email address found in it.

    Args:
        member: Dict with a ``"CV"`` key holding the PDF URL (or a falsy
            value when there is no CV). ``"Navn"`` is used in error messages
            when present.

    Returns:
        The first address that follows an "E-mail:"-style label in the PDF
        text, or ``None`` when there is no CV, no address is found, or the
        download or parsing fails (the failure is logged).
    """
    pdf_url = member["CV"]
    if not pdf_url:
        return None
    try:
        response = requests.get(pdf_url)
        logging.debug(f"Fetching {pdf_url}, from cache: {response.from_cache}")
        # Parse straight from memory: no scratch file is created in the
        # working directory, so nothing can be overwritten or left behind.
        reader = PyPDF2.PdfReader(io.BytesIO(response.content))
        for page in reader.pages:
            # Replace \xad (soft hyphen) with - so hyphenated addresses match
            text = (page.extract_text() or '').replace('\xad', '-')
            email_match = re.search(r'E[^a-z]*mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
            if email_match:
                return email_match.group(1)
        return None
    except PyPDF2.errors.PdfReadError as e:
        logging.error(f"Failed to read PDF for member {member.get('Navn')} with URL {pdf_url}: {e}")
        return None
    except Exception as e:
        # Best effort: a single broken CV must not abort the whole run.
        logging.error(f"An error occurred while processing member {member.get('Navn')} with URL {pdf_url}: {e}")
        return None
# Fetch the member overview of every committee, keyed by committee short name
udv_members = {
    udv: extract_members(f"{base_url}/da/udvalg/udvalgene/{udv}/medlemsoversigt")
    for udv in udvalg
}
# Merge into one record per person (keyed by biography URL) and flag each
# committee the person sits on with an "X" in that committee's column
members = {}
for udv, member_list in udv_members.items():
    column = udv.upper()
    for member in member_list:
        biopage = member["biopage"]
        members.setdefault(biopage, {"biopage": biopage})[column] = "X"
# Add name, party, CV link and email to every unique member
for member in members.values():
    member.update(extract_pdf_url(member["biopage"]))
    member['Email'] = extract_email_from_pdf(member) if member["CV"] else None
# One row per member
members_list = list(members.values())
# Fixed personal columns first, then the committee columns in sorted order
sorted_udvalg = sorted(udvalg)
columns_order = ['Navn', 'Parti', 'Email', 'biopage', 'CV']
columns_order.extend(udv.upper() for udv in sorted_udvalg)
# Step 4: Build the table in that column order and write it as an ODS file
df = pd.DataFrame(members_list).reindex(columns=columns_order)
df.to_excel('ft-udvalgsmedlemmer.ods', index=False)
print("Data has been successfully saved to ft-udvalgsmedlemmer.ods")