Scrape a webpage to create a list of all the PDFs it links to

In [6]:
import requests # for scraping webpages
import re # regular expressions

See another Regex cheat sheet with examples here: http://www.rexegg.com/regex-quickstart.html

First, we access the webpage http://curca.buffalo.edu/students/posters2016.php and save its HTML content into a string

In [7]:
url = 'http://curca.buffalo.edu/students/posters2016.php' # web address
s = requests.get(url)  # the HTTP response; s.text holds the html as one long string
s.text[:100]  # view the start of the string
Out[7]:
'<!DOCTYPE html>\r\n<html lang="en">\r\n\t<head>\r\n\t\t<title>Celebration of Student Academic Excellence | Ce'
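
A quick sanity check is worth doing here (a minimal sketch using requests' built-in helpers): raise_for_status() turns a 4xx/5xx response into an exception, so a bad URL fails loudly instead of handing us an error page to scrape.

s.status_code         # 200 on success
s.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx responses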

Next we look for links to PDFs, which will look like this in the HTML:

In [8]:
#   <a href="pdfs/2016_posters/Lowe.pdf">509 Michigan Avenue Redevelopment</a>
In [9]:
myre = r'<a href="(pdfs/2016_posters/.+\.pdf)'  # search string
# NOTES:
#       1. This relies on all the PDFs living in the folder pdfs/2016_posters
#       2. The parentheses capture only the web address, not the surrounding
#          html code '<a href ...'
#       3. .+ matches one or more characters (the rest of the file name);
#          since . does not match newlines, each match stays on its own line

matches = re.findall(myre, s.text)  # list of captured strings

# Note that this prefix comes before the folder 'pdfs' in the full web address
prefix = 'http://curca.buffalo.edu/students/' 

# Add the prefix to each match to build the full URLs
pdfs = [prefix + match for match in matches]

# Let's look at the first 10
pdfs[:10]
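
A minimal sketch (the file names A.pdf and B.pdf are invented) showing why note 3 matters: the greedy .+ can overmatch when two links share a line, while the non-greedy .+? stops at the first .pdf.

line = '<a href="pdfs/2016_posters/A.pdf">A</a> <a href="pdfs/2016_posters/B.pdf">B</a>'
re.findall(r'<a href="(pdfs/2016_posters/.+\.pdf)', line)
# ['pdfs/2016_posters/A.pdf">A</a> <a href="pdfs/2016_posters/B.pdf']
re.findall(r'<a href="(pdfs/2016_posters/.+?\.pdf)', line)
# ['pdfs/2016_posters/A.pdf', 'pdfs/2016_posters/B.pdf']

Our page gets away with the greedy version because each link sits on its own line and . does not match newlines. Also worth knowing: urljoin from the standard library resolves a relative link against the page address the way a browser would, so we would not have to hard-code the prefix:

from urllib.parse import urljoin
urljoin(url, 'pdfs/2016_posters/Lowe.pdf')
# 'http://curca.buffalo.edu/students/pdfs/2016_posters/Lowe.pdf'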

Now we will download these PDFs into a folder on your computer

We will download them using the 'wget' or 'curl' command-line tools

In [10]:
import os # the 'os' (operating system) module lets us run shell commands from Python

# Make a folder for the pdfs
folder_name = 'pdf_folder'
command1 = 'mkdir '+folder_name 
os.system(command1)
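
# A portable alternative (a sketch): os.makedirs creates the folder directly
# in Python, and exist_ok=True keeps it from erroring if it already exists.
# os.makedirs(folder_name, exist_ok=True)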


# LINUX COMMAND TO GET FILES FROM INTERNET

# the curl format is     curl url > new_file_name

# the wget format is     wget url -P /path/to/folder
#                        wget url -O /path/to/folder/file_name.pdf


# wget requires the FULL folder path. We build this string using os.getcwd()
print(os.getcwd())

full_folder_name = os.getcwd().replace(" ", "\\ ") + '/' + folder_name  # escape any spaces for the shell
#print(full_folder_name)
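
# A more robust alternative (a sketch): shlex.quote escapes every
# shell-special character in the path, not just spaces.
# import shlex
# full_folder_name = shlex.quote(os.getcwd() + '/' + folder_name)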

# For each PDF, create a command that will download the file
# use curl or wget depending on your operating system and/or preference
#for pdf in pdfs:
    #command = 'wget ' + pdf + ' -P ' + full_folder_name
    #command = 'curl ' + pdf + ' > ' + folder_name+'/'+pdf.split('/')[-1]
    #print(command)
/Users/dallastaylor/Desktop/Dropbox/Teaching/Teaching Sp18/Data Oriented Computed/Course Materials/Lectures/class6/class6_files
In [11]:
# OK, let's grab them!

for pdf in pdfs:
    #command = 'wget ' + pdf + ' -P ' + full_folder_name
    command = 'curl ' + pdf + ' > ' + folder_name + '/' + pdf.split('/')[-1]
    #print(command)
    os.system(command)
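
If curl or wget is not available, the requests library we already imported can do the job by itself. A minimal sketch (same pdfs and folder_name as above):

for pdf in pdfs:
    r = requests.get(pdf)                           # fetch one PDF
    fname = folder_name + '/' + pdf.split('/')[-1]  # reuse the file name from the URL
    with open(fname, 'wb') as f:                    # 'wb' because PDFs are binary
        f.write(r.content)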

If you want to capture the command's output in your Python code, you'll need a more involved method: Google "python subprocess.Popen".
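
As a taste of that, subprocess.run (the simpler modern wrapper around Popen) can capture a command's output directly. A minimal sketch; the -sS flags just hide curl's progress meter while keeping error messages:

import subprocess
result = subprocess.run(['curl', '-sS', pdfs[0]], capture_output=True)
result.stdout[:4]  # b'%PDF', the first bytes of a valid PDF file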