ft-udvalg: Download committee members from folketing.dk as ODS.
This commit is contained in:
parent
fb1f3af984
commit
3be8b564cb
Makefile | 28
@@ -1,23 +1,23 @@
 CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac \
 	duplicate-packets em emoticons encdir fanspeed field \
-	find-first-fail find-optimal forever fxkill G gitnext gitundo \
-	goodpasswd histogram Loffice mtrr mirrorpdf neno not off \
-	pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \
-	rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
-	swapout T teetime timestamp tracefile transpose upsidedown \
-	vid w4it-for-port-open whitehash wifi-reload wssh \
-	youtube-lbry ytv yyyymmdd
+	find-first-fail find-optimal forever ft-udvalg fxkill G \
+	gitnext gitundo goodpasswd histogram Loffice mtrr mirrorpdf \
+	neno not off pdfman pidcmd pidtree plotpipe puniq ramusage \
+	rand rclean rina rn rrm seekmaniac shython sound-reload \
+	splitvideo stdout swapout T teetime timestamp tracefile \
+	transpose upsidedown vid w4it-for-port-open whitehash \
+	wifi-reload wssh youtube-lbry ytv yyyymmdd
 
 all: 2search/2grep.1 2search/2search.1 blink/blink.1 \
 	burncpu/burncpu.1 bwlimit/bwlimit.1 clipboard/clipboard.1 \
 	drac/drac.1 encdir/encdir.1 fanspeed/fanspeed.1 field/field.1 \
 	find-first-fail/find-first-fail.1 find-optimal/find-optimal.1 \
-	G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 \
-	goodpasswd/goodpasswd.1 histogram/histogram.1 \
-	mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 \
-	pidcmd/pidcmd.1 pidtree/pidtree.1 plotpipe/plotpipe.1 \
-	puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \
-	seekmaniac/seekmaniac.1 shython/shython.1 \
+	ft-udvalg/ft-udvalg.1 G/G.1 gitnext/gitnext.1 \
+	gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
+	histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
+	off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1 \
+	plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1 \
+	rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1 \
 	sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
 	stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \
 	tracefile/tracefile.1 transpose/transpose.1 T/T.1 \
README | 4
@@ -24,8 +24,12 @@ find-first-fail - find the lowest argument that makes a command fail.
 
 forever - run the same command or list of commands every second.
 
+ft-udvalg - Download committee members from folketing.dk as ODS.
+
 G - shorthand for multi level grep.
 
 gitedit - edit last 10 commits.
 
 gitnext - checkout next revision. Opposite of 'checkout HEAD^'.
 
 gitundo - undo commit.
ft-udvalg/ft-udvalg | 200 (new executable file)
@@ -0,0 +1,200 @@
#!/usr/bin/python3

"""
=pod

=encoding UTF-8

=head1 NAME

ft-udvalg - Download committee members from folketing.dk as ODS


=head1 SYNOPSIS

B<ft-udvalg>


=head1 DESCRIPTION

B<ft-udvalg> will walk through REU, BEU, BUU, EPI, ERU, EUU, FIU, FOU,
FÆU, GRU, BOU, IFU, KIU, KEF, KUU, LIU, MOF, SAU, SOU, SUU, TRU, UFO,
URU, UUI, ULØ, and UVP, select all the members, add their email
addresses, and put them in an ODS file that is easy to use with Auto
Filter.

ft.dk requires your IP address to be from Denmark; otherwise you will
be blocked by Cloudflare.


=head1 EXAMPLE

Generate ft-udvalgsmedlemmer.ods:

  ft-udvalg


=head1 AUTHOR

Copyright (C) 2024 Ole Tange,
http://ole.tange.dk and Free Software Foundation, Inc.


=head1 LICENSE

Copyright (C) 2012 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.


=head1 DEPENDENCIES

B<ft-udvalg> uses B<python3> and the Python modules B<requests>,
B<requests_cache>, B<bs4> (BeautifulSoup), B<pandas>, and B<PyPDF2>;
writing ODS with pandas also needs B<odfpy>.


=head1 SEE ALSO

B<python3>

=cut
"""

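# Output: ft-udvalgsmedlemmer.ods in the current directory.  A temporary
# temp.pdf is written and removed again while each CV is scanned for an email.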
import os
import logging
import requests
import requests_cache
from bs4 import BeautifulSoup
import pandas as pd
import PyPDF2
import re

# Enable logging for requests-cache
logging.basicConfig(level=logging.DEBUG)

# Initialize the cache
cache_dir = os.path.expanduser("~/.cache/ft-udvalg")
requests_cache.install_cache(cache_name=cache_dir, backend='sqlite', expire_after=86400)  # Cache expires after 1 day

base_url = "https://www.ft.dk"

udvalg = [
    "reu", "beu", "buu", "epi", "eru", "euu", "fiu", "fou", "fæu",
    "gru", "bou", "ifu", "kiu", "kef", "kuu", "liu", "mof", "sau",
    "sou", "suu", "tru", "ufo", "uru", "uui", "ulø", "uvp"
]

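# Each lowercase code above is one committee.  Besides selecting which
# member overviews to fetch, the code (uppercased) also becomes an
# "X"-marked membership column in the output spreadsheet.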
# Step 1: Extract member links from the provided URL
def extract_members(udvalg_url):
    response = requests.get(udvalg_url)
    logging.debug(f"Fetching {udvalg_url}, from cache: {response.from_cache}")
    soup = BeautifulSoup(response.text, 'html.parser')
    members = []

    for td_tag in soup.find_all('td', {'data-title': 'Navn'}):
        a_tag = td_tag.find('a', href=True)
        if a_tag:
            # Keep hrefs that are already absolute member links; prefix relative ones with base_url
            url = a_tag['href'] if a_tag['href'].startswith(base_url + '/medlemmer/mf/') else base_url + a_tag['href']
            members.append({"biopage": url})
    return members

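# The code below expects the biography page title to have the form
# "Name (Party)" and looks for a download button whose text contains "CV"
# to find the member's CV PDF.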
# Step 2: Extract the name and PDF URL for each member
def extract_pdf_url(member_url):
    response = requests.get(member_url)
    logging.debug(f"Fetching {member_url}, from cache: {response.from_cache}")
    soup = BeautifulSoup(response.text, 'html.parser')
    name = soup.find('h1', class_='biography-page-title').text.strip()
    match = re.match(r'^(.*)\s\((.*)\)$', name)
    if match:
        name, party = match.groups()
    else:
        raise ValueError("Text format does not match 'Name (Party)'")
    pdf_url = next((button['href'] for button in soup.select('a.download__container__docBtns__btn') if "CV" in button.get_text()), None)

    if pdf_url and not pdf_url.startswith(base_url):
        pdf_url = base_url + pdf_url

    return {'Navn': name, 'Parti': party, 'CV': pdf_url}

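# The email address is taken from the CV PDF rather than from the
# biography page.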
# Step 3: Extract email from the PDF
def extract_email_from_pdf(member):
    pdf_url = member["CV"]
    if not pdf_url:
        return None

    pdf_path = 'temp.pdf'
    try:
        response = requests.get(pdf_url)
        logging.debug(f"Fetching {pdf_url}, from cache: {response.from_cache}")

        with open(pdf_path, 'wb') as file:
            file.write(response.content)

        reader = PyPDF2.PdfReader(pdf_path)
        email = None

        for page in reader.pages:
            # Replace \xad (soft hyphen) with - so "E-mail" survives text extraction
            text = (page.extract_text() or '').replace('\xad', '-')
            email_match = re.search(r'E[^a-z]*mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
            if email_match:
                email = email_match.group(1)
                break

        return email

    except PyPDF2.errors.PdfReadError as e:
        logging.error(f"Failed to read PDF for member {member['Navn']} with URL {pdf_url}: {e}")
        return None
    except Exception as e:
        logging.error(f"An error occurred while processing member {member['Navn']} with URL {pdf_url}: {e}")
        return None
    finally:
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

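# Main flow: fetch every committee's member overview, merge the members by
# biography URL, enrich each member with name, party, CV and email, and
# finally write everything to an ODS file.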
# Process members from each committee
udv_members = {}
for udv in udvalg:
    udvalg_url = f"{base_url}/da/udvalg/udvalgene/{udv}/medlemsoversigt"
    udv_members[udv] = extract_members(udvalg_url)

# Consolidate members
members = {}
for udv, member_list in udv_members.items():
    for member in member_list:
        if member["biopage"] not in members:
            members[member["biopage"]] = {"biopage": member["biopage"]}
        members[member["biopage"]][udv.upper()] = "X"  # Mark membership

# Extract additional data for each unique member
for member in members.values():
    pdf_data = extract_pdf_url(member["biopage"])
    member.update(pdf_data)
    member['Email'] = extract_email_from_pdf(member) if member["CV"] else None

# Convert the members dictionary to a list of dictionaries
members_list = list(members.values())

# Define the column order
sorted_udvalg = sorted(udvalg)
columns_order = ['Navn', 'Parti', 'Email', 'biopage', 'CV'] + [udv.upper() for udv in sorted_udvalg]

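# pandas picks the ODS writer from the .ods extension; this requires the
# odfpy package to be installed.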
# Step 4: Save the extracted data to an ODS file
df = pd.DataFrame(members_list)

# Reorder columns
df = df.reindex(columns=columns_order)

df.to_excel('ft-udvalgsmedlemmer.ods', index=False)

print("Data has been successfully saved to ft-udvalgsmedlemmer.ods")