ft-udvalg: Download udvalgsmembers from folketing.dk as ODS.

2024-07-15 16:03:54 +02:00 · 2024-07-15 16:03:54 +02:00 · 3be8b564cb
parent fb1f3af984
commit 3be8b564cb
3 changed files with 218 additions and 14 deletions
--- a/26
+++ b/26
@ -1,23 +1,23 @@
 CMD = 2grep 2search audioping blink burncpu bwlimit clipboard drac	\
 	duplicate-packets em emoticons encdir fanspeed field		\
-	find-first-fail find-optimal forever fxkill G gitnext gitundo	\
+	find-first-fail find-optimal forever ft-udvalg fxkill G		\
-	goodpasswd histogram Loffice mtrr mirrorpdf neno not off	\
+	gitnext gitundo goodpasswd histogram Loffice mtrr mirrorpdf	\
-	pdfman pidcmd pidtree plotpipe puniq ramusage rand rclean	\
+	neno not off pdfman pidcmd pidtree plotpipe puniq ramusage	\
-	rina rn rrm seekmaniac shython sound-reload splitvideo stdout	\
+	rand rclean rina rn rrm seekmaniac shython sound-reload		\
-	swapout T teetime timestamp tracefile transpose upsidedown	\
+	splitvideo stdout swapout T teetime timestamp tracefile		\
-	vid w4it-for-port-open whitehash wifi-reload wssh		\
+	transpose upsidedown vid w4it-for-port-open whitehash		\
-	youtube-lbry ytv yyyymmdd
+	wifi-reload wssh youtube-lbry ytv yyyymmdd
 all: 2search/2grep.1 2search/2search.1 blink/blink.1			\
 	burncpu/burncpu.1 bwlimit/bwlimit.1 clipboard/clipboard.1	\
 	drac/drac.1 encdir/encdir.1 fanspeed/fanspeed.1 field/field.1	\
 	find-first-fail/find-first-fail.1 find-optimal/find-optimal.1	\
-	G/G.1 gitnext/gitnext.1 gitundo/gitundo.1			\
+	ft-udvalg/ft-udvalg.1 G/G.1 gitnext/gitnext.1			\
-	goodpasswd/goodpasswd.1 histogram/histogram.1			\
+	gitundo/gitundo.1 goodpasswd/goodpasswd.1			\
-	mirrorpdf/mirrorpdf.1 neno/neno.1 off/off.1 pdfman/pdfman.1	\
+	histogram/histogram.1 mirrorpdf/mirrorpdf.1 neno/neno.1		\
-	pidcmd/pidcmd.1 pidtree/pidtree.1 plotpipe/plotpipe.1		\
+	off/off.1 pdfman/pdfman.1 pidcmd/pidcmd.1 pidtree/pidtree.1	\
-	puniq/puniq.1 rand/rand.1 rina/rina.1 rn/rn.1 rrm/rrm.1		\
+	plotpipe/plotpipe.1 puniq/puniq.1 rand/rand.1 rina/rina.1	\
-	seekmaniac/seekmaniac.1 shython/shython.1			\
+	rn/rn.1 rrm/rrm.1 seekmaniac/seekmaniac.1 shython/shython.1	\
 	sound-reload/sound-reload.1 splitvideo/splitvideo.1		\
 	stdout/stdout.1 teetime/teetime.1 timestamp/timestamp.1		\
 	tracefile/tracefile.1 transpose/transpose.1 T/T.1		\
--- a/4
+++ b/4
@ -24,8 +24,12 @@ find-first-fail - find the lowest argument that makes a command fail.
 forever - run the same command or list of commands every second.
 ft-udvalg - Download udvalgsmembers from folketing.dk as ODS.
 G - shorthand for multi level grep.
 gitedit - edit last 10 commits.
 gitnext - checkout next revision. Opposite of 'checkout HEAD^'.
 gitundo - undo commit.
--- a/ft-udvalg/ft-udvalg
+++ b/ft-udvalg/ft-udvalg
@ -0,0 +1,200 @@
 #!/usr/bin/python3
 """
 =pod
 =encoding UTF-8
 =head1 NAME
 ft-udvalg - Download udvalgsmembers from folketing.dk as ODS
 =head1 SYNOPSIS
 B<ft-udvalg>
 =head1 DESCRIPTION
 B<ft-udvalg> will walk through REU, BEU, BUU, EPI, ERU, EUU, FIU, FOU,
 FÆU, GRU, BOU, IFU, KIU, KEF, KUU, LIU, MOF, SAU, SOU, SUU, TRU, UFO,
 URU, UUI, ULØ, and UVP, select all the members, add their email
 addresses, and put it in an ODS-file that is easy to use with Auto
 Filter.
 ft.dk requires your IP address to be from Denmark. Otherwise you will
 be blocked by CloudFlare.
 =head1 EXAMPLE
 Generate ft-udvalgsmedlemmer.ods:
  ft-udvalg
 =head1 AUTHOR
 Copyright (C) 2024 Ole Tange,
 http://ole.tange.dk and Free Software Foundation, Inc.
 =head1 LICENSE
 Copyright (C) 2012 Free Software Foundation, Inc.
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 at your option any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 =head1 DEPENDENCIES
 B<ft-udvalg> uses B<python3>, and a number of Python modules.
 =head1 SEE ALSO
 B<python3>
 =cut
 """
 # Standard library
 import io
 import logging
 import os
 import re
 from urllib.parse import urljoin
 # Third-party
 import requests
 import requests_cache
 from bs4 import BeautifulSoup
 import pandas as pd
 import PyPDF2
 # Configure the root logger so DEBUG-and-above records are emitted
 logging.basicConfig(level=logging.DEBUG)
 # Route all requests through an SQLite-backed cache stored under ~/.cache
 cache_dir = os.path.expanduser("~/.cache/ft-udvalg")
 requests_cache.install_cache(
    cache_name=cache_dir,
    backend='sqlite',
    expire_after=86400,  # seconds, i.e. one day
 )
 # Site root that site-relative links are prefixed with
 base_url = "https://www.ft.dk"
 # Committee abbreviations, used as path components in the overview URLs
 udvalg = [
    "reu", "beu", "buu", "epi", "eru", "euu", "fiu",
    "fou", "fæu", "gru", "bou", "ifu", "kiu", "kef",
    "kuu", "liu", "mof", "sau", "sou", "suu", "tru",
    "ufo", "uru", "uui", "ulø", "uvp",
 ]
 # Step 1: Extract member links from the provided URL
 def extract_members(udvalg_url):
    """Collect the biography page URL of every member listed for a committee.

    Fetches ``udvalg_url`` and takes the first link found inside each
    ``<td data-title="Navn">`` cell.

    Args:
        udvalg_url: URL of a committee's member overview page.

    Returns:
        A list of ``{"biopage": <absolute URL>}`` dicts, one per linked
        member, in page order. Empty when the page has no such cells.
    """
    response = requests.get(udvalg_url)
    logging.debug(f"Fetching {udvalg_url}, from cache: {response.from_cache}")
    if not response.ok:
        # A blocked or failed request would otherwise just look like an
        # empty committee; make it visible in the log.
        logging.warning(f"Got HTTP {response.status_code} for {udvalg_url}")
    soup = BeautifulSoup(response.text, 'html.parser')
    members = []
    for td_tag in soup.find_all('td', {'data-title': 'Navn'}):
        a_tag = td_tag.find('a', href=True)
        if a_tag:
            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones against the site root; plain concatenation mangled any
            # absolute link outside /medlemmer/mf/.
            members.append({"biopage": urljoin(base_url, a_tag['href'])})
    return members
 # Step 2: Extract the name and PDF URL for each member
 def extract_pdf_url(member_url):
    """Read a member's name, party and CV link from their biography page.

    Args:
        member_url: URL of the member's biography page.

    Returns:
        A dict with the keys ``'Navn'`` (name), ``'Parti'`` (party) and
        ``'CV'`` (absolute URL of the CV download, or ``None`` when the page
        has no download button whose text contains "CV").

    Raises:
        ValueError: If the page has no biography title heading, or the
            heading text is not of the form ``Name (Party)``.
    """
    response = requests.get(member_url)
    logging.debug(f"Fetching {member_url}, from cache: {response.from_cache}")
    soup = BeautifulSoup(response.text, 'html.parser')
    heading = soup.find('h1', class_='biography-page-title')
    if heading is None:
        # Without this guard the failure surfaced as an AttributeError on None
        raise ValueError(f"No biography title found on {member_url}")
    title = heading.text.strip()
    match = re.match(r'^(.*)\s\((.*)\)$', title)
    if not match:
        raise ValueError(f"Text format does not match 'Name (Party)': {title!r} on {member_url}")
    name, party = match.groups()
    # The [href] selector skips anchors without a target, which would
    # otherwise raise KeyError on button['href'].
    pdf_url = next(
        (
            button['href']
            for button in soup.select('a.download__container__docBtns__btn[href]')
            if "CV" in button.get_text()
        ),
        None,
    )
    if pdf_url:
        # Resolve site-relative links; absolute links pass through unchanged
        pdf_url = urljoin(base_url, pdf_url)
    return {'Navn': name, 'Parti': party, 'CV': pdf_url}
 # Step 3: Extract email from the PDF
 def extract_email_from_pdf(member):
    """Find the e-mail address printed in a member's CV PDF.

    Args:
        member: Dict holding ``"CV"`` (URL of the PDF, or a falsy value when
            there is none) and ``'Navn'`` (only used in log messages).

    Returns:
        The first address that follows an "E...mail:" label in the PDF text,
        or ``None`` when there is no CV, no address is found, or the PDF
        cannot be fetched or parsed (the failure is logged).
    """
    pdf_url = member["CV"]
    if not pdf_url:
        return None
    try:
        response = requests.get(pdf_url)
        logging.debug(f"Fetching {pdf_url}, from cache: {response.from_cache}")
        # Parse straight from memory: no temp.pdf in the working directory
        # to clobber, and no cleanup step that can fail when the download
        # itself raised.
        reader = PyPDF2.PdfReader(io.BytesIO(response.content))
        for page in reader.pages:
            # Replace \xad (soft hyphen) with - so the address pattern,
            # which allows '-', can match.
            text = (page.extract_text() or '').replace('\xad', '-')
            email_match = re.search(r'E[^a-z]*mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
            if email_match:
                return email_match.group(1)
        return None
    except PyPDF2.errors.PdfReadError as e:
        logging.error(f"Failed to read PDF for member {member.get('Navn')} with URL {pdf_url}: {e}")
        return None
    except Exception as e:
        # Best effort: a single broken CV must not abort the whole export
        logging.error(f"An error occurred while processing member {member.get('Navn')} with URL {pdf_url}: {e}")
        return None
 # Fetch the member overview of every committee, keyed by committee code
 udv_members = {
    udv: extract_members(f"{base_url}/da/udvalg/udvalgene/{udv}/medlemsoversigt")
    for udv in udvalg
 }
 # Merge the per-committee lists into one record per biography page
 members = {}
 for udv, member_list in udv_members.items():
    column = udv.upper()
    for entry in member_list:
        page = entry["biopage"]
        # Create the record on first sight, then mark membership
        members.setdefault(page, {"biopage": page})[column] = "X"
 # Add name, party, CV link and email to each unique member
 for record in members.values():
    record.update(extract_pdf_url(record["biopage"]))
    if record["CV"]:
        record['Email'] = extract_email_from_pdf(record)
    else:
        record['Email'] = None
 # Flatten the records into a list of rows
 members_list = list(members.values())
 # Fixed leading columns followed by the committee columns in sorted order
 sorted_udvalg = sorted(udvalg)
 columns_order = ['Navn', 'Parti', 'Email', 'biopage', 'CV']
 columns_order += [code.upper() for code in sorted_udvalg]
 # Step 4: Build the table in the chosen column order and write the ODS file
 df = pd.DataFrame(members_list).reindex(columns=columns_order)
 df.to_excel('ft-udvalgsmedlemmer.ods', index=False)
 print("Data has been successfully saved to ft-udvalgsmedlemmer.ods")