HOW TO: Get a newspaper issue or article as a PDF

18.1. HOW TO: Get a newspaper issue or article as a PDF#

You can download PDFs of newspaper and gazette articles, pages, and issues from Trove’s web interface – it’s just a matter of clicking a button. But downloading PDFs using computational methods is not so straightforward. When you click on the buttons in the web interface, you don’t download the PDF from a fixed URL. There’s a bit of JavaScript code behind the button that asks for the PDF to be compiled, then alerts the user when it’s ready. To automate the download process, you need to reproduce these steps in your code. This how-to provides an example of how this can be done using Python.

But what about pages?

Newspaper and gazette pages are treated slightly differently to articles and issues. If you know the page identifier, you can construct a url that will download that page as a PDF without any waiting!

import time
from pathlib import Path

import requests
from requests.exceptions import HTTPError

def ping_pdf(ping_url):
    """
    Ask the server whether a requested PDF has finished generating.

    Returns False if the server answers with a 423 status code,
    and True if it answers with any non-error status code.
    Any other HTTP error status is raised as an HTTPError.
    """
    response = requests.get(ping_url, timeout=30)

    # A 423 response is the only error status treated as "not ready yet"
    if response.status_code == 423:
        return False

    # Any other 4xx/5xx status is a genuine failure, so let it propagate
    response.raise_for_status()
    return True


def get_pdf_url(id, pdf_type, zoom=4, max_tries=5, delay=2):
    """
    Ask for a PDF of an issue or article to be generated and return its download url.

    This doesn't download the PDF itself, it only returns the url to download it from.
    PDFs can take a while to generate, so we need to ping the server to see
    if the PDF is ready before handing back the url.

    Parameters:
        id -- the numeric part of the identifier, for example "424530"
        pdf_type -- "article" gets the zoom level added to its urls,
            any other value (such as "issue") uses the plain urls
        zoom -- zoom level, only used when pdf_type is "article"
        max_tries -- maximum number of times to ping the server
        delay -- seconds to wait before the first ping and after each unsuccessful ping

    Returns the url of the PDF, or None if the PDF wasn't ready after max_tries pings.

    Raises an HTTPError if the request to create the PDF fails,
    or if ping_pdf() gets an error response other than 'not ready yet'.
    """
    base_url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-{pdf_type}{id}"

    if pdf_type == "article":
        prep_url = f"{base_url}/level/{zoom}/prep"
        base_url += f".{zoom}"
    else:
        prep_url = f"{base_url}/prep"

    # Ask for the PDF to be created, this returns a plain text hash that we use in later requests
    # The timeout matches the one used when pinging, so a stalled server can't hang us forever
    response = requests.get(prep_url, timeout=30)

    # Stop here if the request failed, otherwise the body of an error page would be used as the hash
    response.raise_for_status()

    # Get the hash
    prep_id = response.text

    # Url to check if the PDF is ready
    ping_url = f"{base_url}.ping?followup={prep_id}"

    # Give some time to generate pdf
    time.sleep(delay)

    # Are you ready yet?
    for _ in range(max_tries):
        if ping_pdf(ping_url):
            # Ready, so return the url to download from
            return f"{base_url}.pdf?followup={prep_id}"
        time.sleep(delay)

    # Ran out of tries before the PDF was ready
    return None

Get a PDF of an issue#

# Set issue id -- in practice, this would probably be in a loop, accessing a list of issues
issue_id = "424530"

# Get the PDF url -- this will be None if the PDF wasn't ready in time
pdf_url = get_pdf_url(issue_id, "issue")

if pdf_url:
    # Download the PDF
    response = requests.get(pdf_url, timeout=60)

    # Make sure we got the PDF and not an error page before saving
    response.raise_for_status()
    Path(f"issue-{issue_id}.pdf").write_bytes(response.content)
else:
    print(f"The PDF of issue {issue_id} wasn't ready in time, try again later.")

---------------------------------------------------------------------------
TimeoutError                              Traceback (most recent call last)
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:467, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    463         except BaseException as e:
    464             # Remove the TypeError from the exception chain in
    465             # Python 3 (including for exceptions like SystemExit).
    466             # Otherwise it looks like a bug in the code.
--> 467             six.raise_from(e, None)
    468 except (SocketTimeout, BaseSSLError, SocketError) as e:

File <string>:3, in raise_from(value, from_value)

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:462, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    461 try:
--> 462     httplib_response = conn.getresponse()
    463 except BaseException as e:
    464     # Remove the TypeError from the exception chain in
    465     # Python 3 (including for exceptions like SystemExit).
    466     # Otherwise it looks like a bug in the code.

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/http/client.py:1375, in HTTPConnection.getresponse(self)
   1374 try:
-> 1375     response.begin()
   1376 except ConnectionError:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/http/client.py:318, in HTTPResponse.begin(self)
    317 while True:
--> 318     version, status, reason = self._read_status()
    319     if status != CONTINUE:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/http/client.py:279, in HTTPResponse._read_status(self)
    278 def _read_status(self):
--> 279     line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    280     if len(line) > _MAXLINE:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/socket.py:717, in SocketIO.readinto(self, b)
    716 try:
--> 717     return self._sock.recv_into(b)
    718 except timeout:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/ssl.py:1307, in SSLSocket.recv_into(self, buffer, nbytes, flags)
   1304         raise ValueError(
   1305           "non-zero flags not allowed in calls to recv_into() on %s" %
   1306           self.__class__)
-> 1307     return self.read(nbytes, buffer)
   1308 else:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/ssl.py:1163, in SSLSocket.read(self, len, buffer)
   1162 if buffer is not None:
-> 1163     return self._sslobj.read(len, buffer)
   1164 else:

TimeoutError: The read operation timed out

During handling of the above exception, another exception occurred:

ReadTimeoutError                          Traceback (most recent call last)
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    485 try:
--> 486     resp = conn.urlopen(
    487         method=request.method,
    488         url=url,
    489         body=request.body,
    490         headers=request.headers,
    491         redirect=False,
    492         assert_same_host=False,
    493         preload_content=False,
    494         decode_content=False,
    495         retries=self.max_retries,
    496         timeout=timeout,
    497         chunked=chunked,
    498     )
    500 except (ProtocolError, OSError) as err:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:799, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    797     e = ProtocolError("Connection aborted.", e)
--> 799 retries = retries.increment(
    800     method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
    801 )
    802 retries.sleep()

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
    549 if read is False or not self._is_method_retryable(method):
--> 550     raise six.reraise(type(error), error, _stacktrace)
    551 elif read is not None:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/packages/six.py:770, in reraise(tp, value, tb)
    769         raise value.with_traceback(tb)
--> 770     raise value
    771 finally:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:715, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    714 # Make the request on the httplib connection object.
--> 715 httplib_response = self._make_request(
    716     conn,
    717     method,
    718     url,
    719     timeout=timeout_obj,
    720     body=body,
    721     headers=headers,
    722     chunked=chunked,
    723 )
    725 # If we're going to release the connection in ``finally:``, then
    726 # the response doesn't need to know about the connection. Otherwise
    727 # it will also try to release it and we'll have a double-release
    728 # mess.

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:469, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    468 except (SocketTimeout, BaseSSLError, SocketError) as e:
--> 469     self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
    470     raise

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:358, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value)
    357 if isinstance(err, SocketTimeout):
--> 358     raise ReadTimeoutError(
    359         self, url, "Read timed out. (read timeout=%s)" % timeout_value
    360     )
    362 # See the above comment about EAGAIN in Python 3. In Python 2 we have
    363 # to specifically catch it and throw the timeout error

ReadTimeoutError: HTTPSConnectionPool(host='trove.nla.gov.au', port=443): Read timed out. (read timeout=30)

During handling of the above exception, another exception occurred:

ReadTimeout                               Traceback (most recent call last)
Cell In[3], line 5
      2 issue_id = "424530"
      4 # Get the PDF url
----> 5 pdf_url = get_pdf_url(issue_id, "issue")
      7 # Download and save the PDF
      8 response = requests.get(pdf_url)

Cell In[2], line 51, in get_pdf_url(id, pdf_type, zoom)
     49 # Are you ready yet?
     50 while ready is False and tries < 5:
---> 51     ready = ping_pdf(ping_url)
     52     if not ready:
     53         tries += 1

Cell In[2], line 8, in ping_pdf(ping_url)
      6 ready = False
      7 try:
----> 8     response = requests.get(ping_url, timeout=30)
      9     response.raise_for_status()
     10 except HTTPError:

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/api.py:73, in get(url, params, **kwargs)
     62 def get(url, params=None, **kwargs):
     63     r"""Sends a GET request.
     64 
     65     :param url: URL for the new :class:`Request` object.
   (...)
     70     :rtype: requests.Response
     71     """
---> 73     return request("get", url, params=params, **kwargs)

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/api.py:59, in request(method, url, **kwargs)
     55 # By using the 'with' statement we are sure the session is closed, thus we
     56 # avoid leaving sockets open which can trigger a ResourceWarning in some
     57 # cases, and look like a memory leak in others.
     58 with sessions.Session() as session:
---> 59     return session.request(method=method, url=url, **kwargs)

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    584 send_kwargs = {
    585     "timeout": timeout,
    586     "allow_redirects": allow_redirects,
    587 }
    588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
    591 return resp

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
    700 start = preferred_clock()
    702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
    705 # Total elapsed time of the request (approximately)
    706 elapsed = preferred_clock() - start

File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/adapters.py:532, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    530     raise SSLError(e, request=request)
    531 elif isinstance(e, ReadTimeoutError):
--> 532     raise ReadTimeout(e, request=request)
    533 elif isinstance(e, _InvalidHeader):
    534     raise InvalidHeader(e, request=request)

ReadTimeout: HTTPSConnectionPool(host='trove.nla.gov.au', port=443): Read timed out. (read timeout=30)

Get a PDF of an article#

# Set article id -- in practice, this would probably be in a loop, accessing a list of articles
article_id = "61389505"

# Get the PDF url -- this will be None if the PDF wasn't ready in time
pdf_url = get_pdf_url(article_id, "article")

if pdf_url:
    # Download the PDF
    response = requests.get(pdf_url, timeout=60)

    # Make sure we got the PDF and not an error page before saving
    response.raise_for_status()
    Path(f"article-{article_id}.pdf").write_bytes(response.content)
else:
    print(f"The PDF of article {article_id} wasn't ready in time, try again later.")

7f4e2fd0902e83aa398e3dc2ac3bb055

HOW TO: Get a newspaper issue or article as a PDF

Contents

18.1. HOW TO: Get a newspaper issue or article as a PDF#

Get a PDF of an issue#

Get a PDF of an article#