ft-udvalg: Download udvalgsmembers from folketing.dk as ODS.
This commit is contained in:
parent
fb1f3af984
commit
3be8b564cb
28
Makefile
28
Makefile
|
@ -1,23 +1,23 @@
|
||||||
CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac \
|
CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac \
|
||||||
duplicate-packets em emoticons encdir fanspeed field \
|
duplicate-packets em emoticons encdir fanspeed field \
|
||||||
find-first-fail find-optimal forever fxkill G gitnext gitundo \
|
find-first-fail find-optimal forever ft-udvalg fxkill G \
|
||||||
goodpasswd histogram Loffice mtrr mirrorpdf neno not off \
|
gitnext gitundo goodpasswd histogram Loffice mtrr mirrorpdf \
|
||||||
pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean \
|
neno not off pdfman pidcmd pidtree plotpipe puniq ramusage \
|
||||||
rina rn rrm seekmaniac shython sound-reload splitvideo stdout \
|
rand rclean rina rn rrm seekmaniac shython sound-reload \
|
||||||
swapout T teetime timestamp tracefile transpose upsidedown \
|
splitvideo stdout swapout T teetime timestamp tracefile \
|
||||||
vid w4it-for-port-open whitehash wifi-reload wssh \
|
transpose upsidedown vid w4it-for-port-open whitehash \
|
||||||
youtube-lbry ytv yyyymmdd
|
wifi-reload wssh youtube-lbry ytv yyyymmdd
|
||||||
|
|
||||||
all: 2search/2grep.1 2search/2search.1 blink/blink.1 \
|
all: 2search/2grep.1 2search/2search.1 blink/blink.1 \
|
||||||
burncpu/burncpu.1 bwlimit/bwlimit.1 clipboard/clipboard.1 \
|
burncpu/burncpu.1 bwlimit/bwlimit.1 clipboard/clipboard.1 \
|
||||||
drac/drac.1 encdir/encdir.1 fanspeed/fanspeed.1 field/field.1 \
|
drac/drac.1 encdir/encdir.1 fanspeed/fanspeed.1 field/field.1 \
|
||||||
find-first-fail/find-first-fail.1 find-optimal/find-optimal.1 \
|
find-first-fail/find-first-fail.1 find-optimal/find-optimal.1 \
|
||||||
G/G.1 gitnext/gitnext.1 gitundo/gitundo.1 \
|
ft-udvalg/ft-udvalg.1 G/G.1 gitnext/gitnext.1 \
|
||||||
goodpasswd/goodpasswd.1 histogram/histogram.1 \
|
gitundo/gitundo.1 goodpasswd/goodpasswd.1 \
|
||||||
mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1 \
|
histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1 \
|
||||||
pidcmd/pidcmd.1 pidtree/pidtree.1 plotpipe/plotpipe.1 \
|
off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1 \
|
||||||
puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1 \
|
plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1 \
|
||||||
seekmaniac/seekmaniac.1 shython/shython.1 \
|
rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1 \
|
||||||
sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
|
sound-reload/sound-reload.1 splitvideo/splitvideo.1 \
|
||||||
stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \
|
stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1 \
|
||||||
tracefile/tracefile.1 transpose/transpose.1 T/T.1 \
|
tracefile/tracefile.1 transpose/transpose.1 T/T.1 \
|
||||||
|
|
4
README
4
README
|
@ -24,8 +24,12 @@ find-first-fail - find the lowest argument that makes a command fail.
|
||||||
|
|
||||||
forever - run the same command or list of commands every second.
|
forever - run the same command or list of commands every second.
|
||||||
|
|
||||||
|
ft-udvalg - Download udvalgsmembers from folketing.dk as ODS.
|
||||||
|
|
||||||
G - shorthand for multi level grep.
|
G - shorthand for multi level grep.
|
||||||
|
|
||||||
|
gitedit - edit last 10 commits.
|
||||||
|
|
||||||
gitnext - checkout next revision. Opposite of 'checkout HEAD^'.
|
gitnext - checkout next revision. Opposite of 'checkout HEAD^'.
|
||||||
|
|
||||||
gitundo - undo commit.
|
gitundo - undo commit.
|
||||||
|
|
200
ft-udvalg/ft-udvalg
Executable file
200
ft-udvalg/ft-udvalg
Executable file
|
@ -0,0 +1,200 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
=pod
|
||||||
|
|
||||||
|
=encoding UTF-8
|
||||||
|
|
||||||
|
=head1 NAME
|
||||||
|
|
||||||
|
ft-udvalg - Download udvalgsmembers from folketing.dk as ODS
|
||||||
|
|
||||||
|
|
||||||
|
=head1 SYNOPSIS
|
||||||
|
|
||||||
|
B<ft-udvalg>
|
||||||
|
|
||||||
|
|
||||||
|
=head1 DESCRIPTION
|
||||||
|
|
||||||
|
B<ft-udvalg> will walk through REU, BEU, BUU, EPI, ERU, EUU, FIU, FOU,
|
||||||
|
FÆU, GRU, BOU, IFU, KIU, KEF, KUU, LIU, MOF, SAU, SOU, SUU, TRU, UFO,
|
||||||
|
URU, UUI, ULØ, and UVP, select all the members, add their email
|
||||||
|
addresses, and put it in an ODS-file that is easy to use with Auto
|
||||||
|
Filter.
|
||||||
|
|
||||||
|
ft.dk requires your IP address to be from Denmark. Otherwise you will
|
||||||
|
be blocked by CloudFlare.
|
||||||
|
|
||||||
|
=head1 EXAMPLE
|
||||||
|
|
||||||
|
Generate ft-udvalgsmedlemmer.ods:
|
||||||
|
|
||||||
|
ft-udvalg
|
||||||
|
|
||||||
|
|
||||||
|
=head1 AUTHOR
|
||||||
|
|
||||||
|
Copyright (C) 2024 Ole Tange,
|
||||||
|
http://ole.tange.dk and Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
|
||||||
|
=head1 LICENSE
|
||||||
|
|
||||||
|
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 3 of the License, or
|
||||||
|
at your option any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
|
=head1 DEPENDENCIES
|
||||||
|
|
||||||
|
B<ft-udvalg> uses B<python3> and the Python modules B<requests>, B<requests_cache>, B<bs4> (BeautifulSoup), B<pandas>, and B<PyPDF2>.
|
||||||
|
|
||||||
|
|
||||||
|
=head1 SEE ALSO
|
||||||
|
|
||||||
|
B<python3>
|
||||||
|
|
||||||
|
=cut
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import requests
|
||||||
|
import requests_cache
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import pandas as pd
|
||||||
|
import PyPDF2
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Log at DEBUG level; the fetch helpers report cache hits/misses via logging.debug.
logging.basicConfig(level=logging.DEBUG)

# Install a process-wide HTTP cache for `requests`, stored with the SQLite
# backend under the user's cache directory.
cache_dir = os.path.expanduser("~/.cache/ft-udvalg")
requests_cache.install_cache(
    cache_name=cache_dir,
    backend='sqlite',
    expire_after=86400,  # seconds: cached responses expire after 1 day
)

# Site root used to build committee URLs and to make scraped links absolute.
base_url = "https://www.ft.dk"

# Committee codes; each one is inserted into the member-overview URL and,
# upper-cased, becomes a column in the resulting spreadsheet.
udvalg = [
    "reu", "beu", "buu", "epi", "eru", "euu", "fiu", "fou", "fæu",
    "gru", "bou", "ifu", "kiu", "kef", "kuu", "liu", "mof", "sau",
    "sou", "suu", "tru", "ufo", "uru", "uui", "ulø", "uvp",
]
|
||||||
|
|
||||||
|
# Step 1: Extract member links from the provided URL
|
||||||
|
def extract_members(udvalg_url):
    """Return the biography-page links listed on a committee member overview.

    Fetches ``udvalg_url`` and, for every ``<td data-title="Navn">`` cell,
    takes the first ``<a href=...>`` inside it.

    Args:
        udvalg_url: URL of a committee's member overview page.

    Returns:
        A list of ``{"biopage": <absolute URL>}`` dicts in document order,
        one per cell that contains a link. Duplicates are not removed.
    """
    from urllib.parse import urljoin

    response = requests.get(udvalg_url)
    # `from_cache` is read defensively so a response object that lacks the
    # attribute does not crash the debug log line.
    from_cache = getattr(response, 'from_cache', False)
    logging.debug(f"Fetching {udvalg_url}, from cache: {from_cache}")
    if not response.ok:
        # An error page parses fine but contains no member table, which would
        # otherwise silently yield an empty committee.
        logging.warning(f"Fetching {udvalg_url} returned HTTP {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')
    members = []

    for td_tag in soup.find_all('td', {'data-title': 'Navn'}):
        a_tag = td_tag.find('a', href=True)
        if a_tag:
            # urljoin leaves absolute links untouched and resolves relative
            # ones against the site root, instead of blindly prefixing
            # base_url (which mangled absolute links outside /medlemmer/mf/).
            members.append({"biopage": urljoin(base_url, a_tag['href'])})
    return members
|
||||||
|
|
||||||
|
# Step 2: Extract the name and PDF URL for each member
|
||||||
|
def extract_pdf_url(member_url):
    """Scrape a member's biography page for name, party and CV link.

    The page title (``<h1 class="biography-page-title">``) is expected to
    read ``Name (Party)``. The CV link is the ``href`` of the first
    ``a.download__container__docBtns__btn`` element whose text contains "CV".

    Args:
        member_url: URL of the member's biography page.

    Returns:
        A dict with the keys ``'Navn'``, ``'Parti'`` and ``'CV'``. ``'CV'`` is
        an absolute URL, or ``None`` when no matching download link exists.

    Raises:
        ValueError: if the page has no biography title, or the title does not
            have the form ``Name (Party)``.
    """
    from urllib.parse import urljoin

    response = requests.get(member_url)
    # `from_cache` is read defensively so a response object that lacks the
    # attribute does not crash the debug log line.
    from_cache = getattr(response, 'from_cache', False)
    logging.debug(f"Fetching {member_url}, from cache: {from_cache}")
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('h1', class_='biography-page-title')
    if title_tag is None:
        # Previously this surfaced as AttributeError on None with no hint
        # about which page was at fault.
        raise ValueError(f"No biography title found on {member_url}")

    title = title_tag.text.strip()
    match = re.match(r'^(.*)\s\((.*)\)$', title)
    if not match:
        raise ValueError(
            f"Text format does not match 'Name (Party)': {title!r} on {member_url}"
        )
    name, party = match.groups()

    # Skip anchors without an href instead of raising KeyError on them.
    pdf_url = next(
        (
            button['href']
            for button in soup.select('a.download__container__docBtns__btn')
            if button.has_attr('href') and "CV" in button.get_text()
        ),
        None,
    )

    if pdf_url:
        # Absolute links are kept as-is; relative ones are resolved against
        # the site root.
        pdf_url = urljoin(base_url, pdf_url)

    return {'Navn': name, 'Parti': party, 'CV': pdf_url}
|
||||||
|
|
||||||
|
# Step 3: Extract email from the PDF
|
||||||
|
# Matches an "E-mail: <address>" style label followed by the address itself.
_EMAIL_RE = re.compile(
    r'E[^a-z]*mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
)


def extract_email_from_pdf(member):
    """Download a member's CV PDF and return the first e-mail address in it.

    The PDF is parsed from memory, so no temporary file is written to the
    current directory.

    Args:
        member: dict with a ``"CV"`` entry (PDF URL or a falsy value). Its
            ``"Navn"`` entry, if present, is used in error messages.

    Returns:
        The first address matched after an "E-mail:" label, scanning pages in
        order, or ``None`` when there is no CV URL, no match, or the download
        or parsing fails (failures are logged, not raised).
    """
    import io

    pdf_url = member["CV"]
    if not pdf_url:
        return None

    try:
        response = requests.get(pdf_url)
        from_cache = getattr(response, 'from_cache', False)
        logging.debug(f"Fetching {pdf_url}, from cache: {from_cache}")

        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which PyPDF2 3.x no longer provides.
        reader = PyPDF2.PdfReader(io.BytesIO(response.content))

        for page in reader.pages:
            # Replace soft hyphens (U+00AD) with '-' before matching;
            # presumably the extracted text uses them where the PDF shows a
            # hyphen.
            text = (page.extract_text() or '').replace('\xad', '-')
            email_match = _EMAIL_RE.search(text)
            if email_match:
                return email_match.group(1)

        return None

    except PyPDF2.errors.PdfReadError as e:
        logging.error(f"Failed to read PDF for member {member.get('Navn')} with URL {pdf_url}: {e}")
        return None
    except Exception as e:
        # Best-effort: one broken CV must not abort the whole run.
        logging.error(f"An error occurred while processing member {member.get('Navn')} with URL {pdf_url}: {e}")
        return None
|
||||||
|
|
||||||
|
# Fetch the member overview of every committee, keyed by committee code.
udv_members = {
    code: extract_members(f"{base_url}/da/udvalg/udvalgene/{code}/medlemsoversigt")
    for code in udvalg
}

# Merge into one record per biography page; each committee a member sits on
# is marked with "X" in a column named after the upper-cased committee code.
members = {}
for code, listed in udv_members.items():
    column = code.upper()
    for entry in listed:
        page = entry["biopage"]
        members.setdefault(page, {"biopage": page})[column] = "X"

# Enrich every unique member with name, party, CV link and e-mail address.
for record in members.values():
    record.update(extract_pdf_url(record["biopage"]))
    if record["CV"]:
        record['Email'] = extract_email_from_pdf(record)
    else:
        record['Email'] = None

# One row per member.
members_list = list(members.values())

# Fixed identity columns first, then one column per committee in sorted order.
sorted_udvalg = sorted(udvalg)
columns_order = ['Navn', 'Parti', 'Email', 'biopage', 'CV']
columns_order.extend(code.upper() for code in sorted_udvalg)

# Step 4: build the table in the chosen column order and write it as ODS.
df = pd.DataFrame(members_list).reindex(columns=columns_order)
df.to_excel('ft-udvalgsmedlemmer.ods', index=False)

print("Data has been successfully saved to ft-udvalgsmedlemmer.ods")
|
Loading…
Reference in a new issue