import requests # for scraping webpages
import re # regular expressions
See another Regex cheat sheet with examples here: http://www.rexegg.com/regex-quickstart.html
First, we access the webpage http://curca.buffalo.edu/students/posters2016.php and save its HTML content as a string
url = 'http://curca.buffalo.edu/students/posters2016.php' # web address
s = requests.get(url)  # fetch the page; s.text holds the HTML as one long string
s.text[:100] # view some of the string
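Before parsing anything, it's worth confirming the request actually succeeded; a quick sanity check (my addition, using requests' built-in error handling):
s.raise_for_status()  # raises an HTTPError if the server returned a 4xx/5xx status
print(s.status_code)  # 200 means the page was fetched successfully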
Next we look for links to the PDFs in the HTML, which will look like:
# <a href="pdfs/2016_posters/Lowe.pdf">509 Michigan Avenue Redevelopment</a>
myre = r'<a href="(pdfs/2016_posters/.+?\.pdf)' # search pattern (raw string)
# NOTES:
# 1. This relies on all of the PDFs being in the folder pdfs/2016_posters
# 2. The parentheses capture only the web address, not the surrounding '<a href ...' markup
# 3. Why .+? here? The dot matches any character, + means one or more of them,
#    and the trailing ? makes the match lazy so it stops at the first '.pdf'
#    (the pattern is tested on a sample line below)
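Before running the pattern on the whole page, we can sanity-check it on the sample link shown above (a quick test I've added):
sample = '<a href="pdfs/2016_posters/Lowe.pdf">509 Michigan Avenue Redevelopment</a>'
print(re.findall(myre, sample))  # expect ['pdfs/2016_posters/Lowe.pdf']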
matches = re.findall(myre, s.text) # all captured strings found in the page
# Note that this prefix comes before the folder 'pdfs' in the full web address
prefix = 'http://curca.buffalo.edu/students/'
# Add the prefix to each string
pdfs = []
for match in matches:
    pdfs.append(prefix + match)  # full web address for each PDF
# Equivalent one-liner: pdfs = [prefix + match for match in matches]
# Let's look at the first 10
pdfs[:10]
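If you'd rather not hard-code the prefix, Python's standard library can resolve relative links against the page URL; a sketch (my addition, not in the original):
from urllib.parse import urljoin
pdfs = [urljoin(url, match) for match in matches]  # yields the same full addresses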
Now we will download these PDFs into a folder on your computer
We will download them using a command-line tool such as wget or curl
import os  # the 'os' (operating system) module lets us run shell commands from Python
# Make a folder for the pdfs
folder_name = 'pdf_folder'
command1 = 'mkdir ' + folder_name
os.system(command1)
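As an aside (my addition): Python can also create the folder itself, which is more portable than shelling out and won't complain if the folder already exists:
os.makedirs(folder_name, exist_ok=True)  # pure-Python equivalent of 'mkdir -p pdf_folder'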
# LINUX COMMAND TO GET FILES FROM INTERNET
# the curl format is curl url > new_file_name
# the wget format is wget url -P /path/to/folder
# wget url -O /path/to/folder/file_name.pdf
# wget's -P flag needs the FULL path to the folder. We build this string using os.getcwd()
print(os.getcwd())
full_folder_name = os.getcwd().replace(" ", "\\ ") + '/' + folder_name  # escape spaces so the shell doesn't split the path
#print(full_folder_name)
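Escaping only spaces is fragile; the standard library can quote an arbitrary path for the shell (my addition, Python 3.3+):
import shlex
full_folder_name = shlex.quote(os.getcwd() + '/' + folder_name)  # safe for any shell-special characters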
# For each PDF, create a command that will download the file.
# Use curl or wget depending on your operating system and/or preference:
#   wget form: 'wget ' + pdf + ' -P ' + full_folder_name
#   curl form: 'curl ' + pdf + ' > ' + folder_name + '/' + pdf.split('/')[-1]
# OK, let's grab them!
for pdf in pdfs:
    #command = 'wget ' + pdf + ' -P ' + full_folder_name
    command = 'curl ' + pdf + ' > ' + folder_name + '/' + pdf.split('/')[-1]
    #print(command)  # uncomment to preview each command without running it
    os.system(command)  # run the download
Note that os.system only returns the command's exit status. If you want to capture the command's output in your code, you'll need a more involved method: Google "python subprocess.Popen". A minimal sketch follows.
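Here is what that can look like (my addition; it assumes the pdfs list and folder_name from above, plus curl's standard -s/-S/-o flags):
import subprocess
p = subprocess.Popen(['curl', '-sS', pdfs[0], '-o', folder_name + '/' + pdfs[0].split('/')[-1]],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()       # wait for curl to finish and collect its output
print(p.returncode)              # 0 means curl succeeded
print(err.decode()[:200])        # any error text curl wrote to stderr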