18.1. HOW TO: Get a newspaper issue or article as a PDF#
You can download PDFs of newspaper and gazette articles, pages, and issues from Trove’s web interface – it’s just a matter of clicking a button. But downloading PDFs using computational methods is not so straightforward. When you click on the buttons in the web interface, you don’t download the PDF from a fixed url. There’s a bit of JavaScript code behind the button that asks for the PDF to be compiled, then alerts the user when it’s ready. To automate the download process, you need to reproduce these steps in your code. This how-to provides an example of how this can be done using Python.
But what about pages?
Newspaper and gazette pages are treated slightly differently to articles and issues. If you know the page identifier, you can construct a url that will download that page as a PDF without any waiting!
import time
from pathlib import Path
import requests
from requests.exceptions import HTTPError
def ping_pdf(ping_url):
    """
    Check to see if a PDF is ready for download.

    Returns True when the ping request succeeds. Returns False when the
    server answers with a 423 status code, or when the request times out,
    so that the caller can simply ping again.

    Any other HTTP error is re-raised.
    """
    try:
        response = requests.get(ping_url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.Timeout:
        # A slow response shouldn't abort the whole job -- report
        # "not ready" and let the caller decide whether to retry.
        return False
    except HTTPError as err:
        # 423 is treated as "not ready yet"; anything else is a real problem
        if err.response is not None and err.response.status_code == 423:
            return False
        raise
    return True
def get_pdf_url(id, pdf_type, zoom=4, max_tries=5, delay=2):
    """
    Ask for a PDF of an issue or article to be generated and return its download url.

    PDFs can take a while to generate, so we need to ping the server to see
    if the PDF is ready before we download it.

    Parameters:
        id: numeric identifier of the issue or article
        pdf_type: "article" or "issue" (inserted into the rendition url)
        zoom: zoom level, only used when pdf_type is "article"
        max_tries: maximum number of times to ping the server
        delay: seconds to wait before the first ping and after each failed ping

    Returns the url of the PDF, or None if it wasn't ready after `max_tries` pings.

    Raises an HTTPError if the request to prepare the PDF fails.
    """
    base_url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-{pdf_type}{id}"
    if pdf_type == "article":
        prep_url = f"{base_url}/level/{zoom}/prep"
        base_url += f".{zoom}"
    else:
        prep_url = f"{base_url}/prep"

    # Ask for the PDF to be created, this returns a plain text hash that we use in later requests
    response = requests.get(prep_url, timeout=30)
    # Stop here on an error, otherwise the body of an error page would be used as the hash
    response.raise_for_status()
    prep_id = response.text

    # Url to check if the PDF is ready
    ping_url = f"{base_url}.ping?followup={prep_id}"

    # Give some time to generate pdf
    time.sleep(delay)

    # Are you ready yet?
    for _ in range(max_tries):
        if ping_pdf(ping_url):
            return f"{base_url}.pdf?followup={prep_id}"
        time.sleep(delay)

    # Not ready within the allowed number of tries
    return None
Get a PDF of an issue#
# Set issue id -- in practice, this would probably be in a loop, accessing a list of issues
issue_id = "424530"

# Get the PDF url -- this will be None if the PDF wasn't ready in time
pdf_url = get_pdf_url(issue_id, "issue")
if pdf_url is None:
    raise RuntimeError(f"PDF of issue {issue_id} was not ready in time, try again later")

# Download the PDF, making sure we really got it before saving anything to disk
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
Path(f"issue-{issue_id}.pdf").write_bytes(response.content)
---------------------------------------------------------------------------
TimeoutError Traceback (most recent call last)
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:467, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
463 except BaseException as e:
464 # Remove the TypeError from the exception chain in
465 # Python 3 (including for exceptions like SystemExit).
466 # Otherwise it looks like a bug in the code.
--> 467 six.raise_from(e, None)
468 except (SocketTimeout, BaseSSLError, SocketError) as e:
File <string>:3, in raise_from(value, from_value)
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:462, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
461 try:
--> 462 httplib_response = conn.getresponse()
463 except BaseException as e:
464 # Remove the TypeError from the exception chain in
465 # Python 3 (including for exceptions like SystemExit).
466 # Otherwise it looks like a bug in the code.
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/http/client.py:1375, in HTTPConnection.getresponse(self)
1374 try:
-> 1375 response.begin()
1376 except ConnectionError:
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/http/client.py:318, in HTTPResponse.begin(self)
317 while True:
--> 318 version, status, reason = self._read_status()
319 if status != CONTINUE:
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/http/client.py:279, in HTTPResponse._read_status(self)
278 def _read_status(self):
--> 279 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
280 if len(line) > _MAXLINE:
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/socket.py:717, in SocketIO.readinto(self, b)
716 try:
--> 717 return self._sock.recv_into(b)
718 except timeout:
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/ssl.py:1307, in SSLSocket.recv_into(self, buffer, nbytes, flags)
1304 raise ValueError(
1305 "non-zero flags not allowed in calls to recv_into() on %s" %
1306 self.__class__)
-> 1307 return self.read(nbytes, buffer)
1308 else:
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/ssl.py:1163, in SSLSocket.read(self, len, buffer)
1162 if buffer is not None:
-> 1163 return self._sslobj.read(len, buffer)
1164 else:
TimeoutError: The read operation timed out
During handling of the above exception, another exception occurred:
ReadTimeoutError Traceback (most recent call last)
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
485 try:
--> 486 resp = conn.urlopen(
487 method=request.method,
488 url=url,
489 body=request.body,
490 headers=request.headers,
491 redirect=False,
492 assert_same_host=False,
493 preload_content=False,
494 decode_content=False,
495 retries=self.max_retries,
496 timeout=timeout,
497 chunked=chunked,
498 )
500 except (ProtocolError, OSError) as err:
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:799, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
797 e = ProtocolError("Connection aborted.", e)
--> 799 retries = retries.increment(
800 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
801 )
802 retries.sleep()
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
549 if read is False or not self._is_method_retryable(method):
--> 550 raise six.reraise(type(error), error, _stacktrace)
551 elif read is not None:
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/packages/six.py:770, in reraise(tp, value, tb)
769 raise value.with_traceback(tb)
--> 770 raise value
771 finally:
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:715, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
714 # Make the request on the httplib connection object.
--> 715 httplib_response = self._make_request(
716 conn,
717 method,
718 url,
719 timeout=timeout_obj,
720 body=body,
721 headers=headers,
722 chunked=chunked,
723 )
725 # If we're going to release the connection in ``finally:``, then
726 # the response doesn't need to know about the connection. Otherwise
727 # it will also try to release it and we'll have a double-release
728 # mess.
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:469, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
468 except (SocketTimeout, BaseSSLError, SocketError) as e:
--> 469 self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
470 raise
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/urllib3/connectionpool.py:358, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value)
357 if isinstance(err, SocketTimeout):
--> 358 raise ReadTimeoutError(
359 self, url, "Read timed out. (read timeout=%s)" % timeout_value
360 )
362 # See the above comment about EAGAIN in Python 3. In Python 2 we have
363 # to specifically catch it and throw the timeout error
ReadTimeoutError: HTTPSConnectionPool(host='trove.nla.gov.au', port=443): Read timed out. (read timeout=30)
During handling of the above exception, another exception occurred:
ReadTimeout Traceback (most recent call last)
Cell In[3], line 5
2 issue_id = "424530"
4 # Get the PDF url
----> 5 pdf_url = get_pdf_url(issue_id, "issue")
7 # Download and save the PDF
8 response = requests.get(pdf_url)
Cell In[2], line 51, in get_pdf_url(id, pdf_type, zoom)
49 # Are you ready yet?
50 while ready is False and tries < 5:
---> 51 ready = ping_pdf(ping_url)
52 if not ready:
53 tries += 1
Cell In[2], line 8, in ping_pdf(ping_url)
6 ready = False
7 try:
----> 8 response = requests.get(ping_url, timeout=30)
9 response.raise_for_status()
10 except HTTPError:
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/api.py:73, in get(url, params, **kwargs)
62 def get(url, params=None, **kwargs):
63 r"""Sends a GET request.
64
65 :param url: URL for the new :class:`Request` object.
(...)
70 :rtype: requests.Response
71 """
---> 73 return request("get", url, params=params, **kwargs)
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/api.py:59, in request(method, url, **kwargs)
55 # By using the 'with' statement we are sure the session is closed, thus we
56 # avoid leaving sockets open which can trigger a ResourceWarning in some
57 # cases, and look like a memory leak in others.
58 with sessions.Session() as session:
---> 59 return session.request(method=method, url=url, **kwargs)
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
584 send_kwargs = {
585 "timeout": timeout,
586 "allow_redirects": allow_redirects,
587 }
588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
591 return resp
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
700 start = preferred_clock()
702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
705 # Total elapsed time of the request (approximately)
706 elapsed = preferred_clock() - start
File /opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/requests/adapters.py:532, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
530 raise SSLError(e, request=request)
531 elif isinstance(e, ReadTimeoutError):
--> 532 raise ReadTimeout(e, request=request)
533 elif isinstance(e, _InvalidHeader):
534 raise InvalidHeader(e, request=request)
ReadTimeout: HTTPSConnectionPool(host='trove.nla.gov.au', port=443): Read timed out. (read timeout=30)
Get a PDF of an article#
# Set article id -- in practice, this would probably be in a loop, accessing a list of articles
article_id = "61389505"

# Get the PDF url -- this will be None if the PDF wasn't ready in time
pdf_url = get_pdf_url(article_id, "article")
if pdf_url is None:
    raise RuntimeError(f"PDF of article {article_id} was not ready in time, try again later")

# Download the PDF, making sure we really got it before saving anything to disk
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
Path(f"article-{article_id}.pdf").write_bytes(response.content)
7f4e2fd0902e83aa398e3dc2ac3bb055
230961