This code pulls each job posting for a specific job title in a specific location (or nationally) and returns and plots the percentage of the postings that contain certain keywords. The code is set up to count all words except stopwords and other user-defined words to ignore (there is probably a much more efficient way of doing this, but I had no need to change this once I had the code running). This allows the user to see common technical skills, as well as common soft skills, that should be included on a resume.

NOTE: I got this idea from https://jessesw.com/Data-Science-Skills/. Obviously, just using his code would be of no real benefit to me, as I wanted to use the idea to help better my skills with scraping data from HTML files. So, I used his idea and developed my own code from scratch. I also modified the overall process a bit to better fit my needs.

NOTE2: This code will not be able to identify multiple-word skills. So, for example, ‘machine learning’ will show up as either ‘machine’ or ‘learning’. However, ‘machine’ could show up for other phrases than ‘machine learning’.

To run the code, change the city, state, and job title to whichever you wish. After generating the plot, you might need to add ‘keywords’ to the additional_stop_words list if you do not want them to be included.

from bs4 import BeautifulSoup
import urllib
import re
from time import sleep
from collections import Counter
from nltk.corpus import stopwords
import pandas as pd
%matplotlib inline
import matplotlib.pylab as plt
from matplotlib.backends.backend_pdf import PdfPages
plt.rcParams['figure.figsize'] = (10.0, 8.0)

Define the city, state, and job title.

# Search parameters used by the rest of the script: the job title to search
# for and the location (city and two-letter state code) to search in.
job_title = 'Data Scientist'
city, state = 'Seattle', 'WA'

Define a function that will take the url and pull out the text of the main body as a list of strings. Remove common words such as ‘the’, ‘or’, ‘and’, etc.

# Words that should never be reported as skills: page boilerplate, leftover
# JavaScript identifiers, and generic job-posting vocabulary. Kept as a
# frozenset built once at import time so membership tests are O(1) and the
# collection is not rebuilt for every posting.
_ADDITIONAL_STOP_WORDS = frozenset([
    'webfont', 'limited', 'saved', 'disability',
    'desirable', 'nreum', 'skills', 'net', '+', 'k',
    'above', 'it', 'end', 'excellent', 'join', 'want',
    'how', 'well', 'sets', 'like', 'page', 'home', 'demonstrated',
    'other', 're', 'size', 'etc', 'gettime', 'work', 'ms',
    'zqdxyrmad', 'description', 'value', 'transactionname',
    'education', 'daylight', 'highly', 'bodyrendered',
    'amazon', 'new', 'bam', 'techniques', 'com',
    'min', 'need', 'email', 'job', 'content', 'features',
    'service', 'wa', 'id', 'modern', 'looking', 'eastern',
    'qualifications', 'teams', 'based', 'false', 'times',
    'software', 'career', 'ability', 'platform', 'years', 'data',
    'date', 'product', 'team', 'time', 'agent', 'information',
    'methods', 'candidate', 'customers', 'back', 'info', 'scientist',
    'experience', 'apply', 'us', 'engineering', 'learning',
    'strong', 'business', 'design', 'title', 'large', 'e', 'document',
    'science', 'company', 'location', 'field', 'communication',
    'customer', 'tools', 'used', 'research', 'model',
    'opportunity', 'online', 'including', 'degree',
    'preferred', 'across', 'beacon', 'using', 'friend', 'function',
    'position', 'window', 'role', '3', 'written', 'build',
    'presentation', 'getelementbyid', 'technical', 'posted',
    'newrelic', 'decision', 'log', 'errorbeacon', 'solutions',
    'applicationtime', 'enable', 'responsibilities',
    'models', 'applicationid', 'complex', 'licensekey',
    'high', 'browser', 'd', 'nr', 'develop', 'please',
    'selection', 'queuetime', 'cookies', 'icimsaddonload',
    'computer', 'icims', 'scientists', 'great', 'returning',
    'systems', 'writing', 'united', 'working', 'iframe',
    'analyses', 'applications', 'try', 'related',
    'states', 'languages', 'yghvbe', 'language', 'one',
    # Was 'llc,' -- that entry could never match because commas are replaced
    # by spaces before the text is split into words.
    'site', 'llc', 'category', 'personalized', 'knowledge',
])


def clean_the_html(url):
    """Return the unique candidate keywords found in the page at *url*.

    The page is downloaded, the text inside its <body> is lower-cased and
    split into words, and NLTK's English stopwords, the words in
    _ADDITIONAL_STOP_WORDS and the words of the module-level ``city`` and
    ``state`` are removed.

    Parameters:
        url: address of a single job posting.

    Returns:
        A list of unique words in arbitrary order (possibly empty), or None
        when the page could not be downloaded.
    """
    # urllib.urlopen reports network failures as IOError; treat those as
    # "posting unavailable" instead of hiding every possible exception.
    try:
        response = urllib.urlopen(url)
    except IOError:
        return None

    # Read the whole page up front so the connection is always released.
    try:
        html = response.read()
    except IOError:
        return None
    finally:
        response.close()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    # Inline JavaScript and CSS are not part of the posting's wording; drop
    # them so their identifiers are not mistaken for keywords.
    for tag in soup.findAll(['script', 'style']):
        tag.decompose()

    # Extract all text inside the <body> tag(s), lower-cased
    page_text = ' '.join(body.get_text(' ', strip=True).lower()
                         for body in soup.findAll('body'))

    # Keep only letters plus '+' and '3' (so that tokens such as 'c++' and
    # 'd3' survive); every other character becomes a word separator.
    page_text = re.sub(r'[^a-zA-Z+3]', ' ', page_text)

    # Only ASCII is left at this point, so converting to a native string is
    # safe and keeps the returned words plain strings.
    words = str(page_text).split()

    # The search location is not a skill. Split it into words so that
    # multi-word cities (e.g. 'New York') are filtered as well.
    location_words = set(city.lower().split()) | set(state.lower().split())

    ignored = (set(stopwords.words("english"))
               | _ADDITIONAL_STOP_WORDS
               | location_words)

    # A set difference both removes the ignored words and de-duplicates, so
    # each keyword is reported at most once per posting.
    return list(set(words) - ignored)

Define a function to generate a list of urls for a given search (i.e., ‘Data Scientist’). Each search result page has 10 non-sponsored links. Search the first url for ‘Jobs # to # of ###’ in order to determine how many iterations to perform.

def _fetch_results_page(url):
    """Download *url* and return it parsed by BeautifulSoup, or None on failure."""
    try:
        response = urllib.urlopen(url)
    except IOError:
        return None
    try:
        return BeautifulSoup(response.read(), 'html.parser')
    except IOError:
        return None
    finally:
        response.close()


def gen_url_list(city,state,job_name):
    """Collect the links to all job postings for a search on indeed.com.

    Parameters:
        city: city to search in.
        state: state abbreviation to search in.
        job_name: job title to search for (case-insensitive).

    Returns:
        A list with one single-element list ``[url]`` per job posting link
        found on the search result pages. The list is empty when the first
        result page cannot be downloaded or does not contain a job count.
    """
    base_url = 'http://www.indeed.com/'
    results_per_page = 10

    # quote_plus turns spaces into '+' and escapes the comma, so multi-word
    # job titles and cities produce a valid query string.
    search_url = ''.join([base_url,
                          'jobs?q=', urllib.quote_plus(job_name.lower()),
                          '&l=', urllib.quote_plus(city),
                          '%2C+', urllib.quote_plus(state)])

    first_page = _fetch_results_page(search_url)
    if first_page is None:
        return []

    # The element with id 'searchCount' holds text like 'Jobs 1 to 10 of 725';
    # the last number in it is the total number of postings.
    count_tag = first_page.find(id = 'searchCount')
    if count_tag is None:
        return []
    total_jobs = count_tag.get_text()
    # Accept thousands separators ('1,234') when looking for the numbers.
    numbers = re.findall(r'\d[\d,]*', total_jobs)
    if not numbers:
        return []
    print(total_jobs.encode('utf-8'))

    total_count = int(numbers[-1].replace(',', ''))
    # Round up so a partially filled last page is still visited.
    page_count = (total_count + results_per_page - 1) // results_per_page

    job_URLS = []
    for i in range(page_count):
        if i % 10 == 0:
            print(i)

        if i == 0:
            # The search URL without an offset is the first result page,
            # which has already been downloaded above.
            soup = first_page
        else:
            page_url = ''.join([search_url,
                                '&start=', str(i * results_per_page)])
            soup = _fetch_results_page(page_url)
            if soup is None:
                # Skip a page that failed to download; keep the rest.
                continue

        for heading in soup.findAll('h2',{'class':'jobtitle'}):
            # Read the href from the parsed tag so the value is unescaped
            # and does not depend on the order of the tag's attributes.
            anchor = heading.find('a', href=True)
            if anchor is not None:
                # base_url already ends with '/', so drop the leading one.
                job_URLS.append([base_url + anchor['href'].lstrip('/')])

    return job_URLS

Now that we have a list of all of the URLs of job postings, pull the information from each site, clean the data, and populate the keyword list.

def job_posting_analysis(url_list):
    """Extract the keyword list of every job posting in *url_list*.

    Parameters:
        url_list: iterable of single-element lists ``[url]``, as produced by
            gen_url_list. None is treated like an empty list, because
            gen_url_list can return None when its search request fails.

    Returns:
        A list with one keyword list per posting that yielded any keywords.
        Postings that could not be downloaded or yielded no keywords are
        skipped.
    """
    job_skills = []
    for count, url in enumerate(url_list or [], 1):
        # Progress indicator: report the 1st, 11th, 21st, ... posting.
        if count % 10 == 1:
            print(count)

        posting_keywords = clean_the_html(url[0])
        if posting_keywords:
            job_skills.append(posting_keywords)
        # Pause between requests so the site is not hit in a tight loop.
        sleep(0.5)

    return job_skills

Now that the various functions are defined, run the code.

First: run gen_url_list for the specified city, state, and job title in order to generate
the list of job posting links.

Second: run job_posting_analysis to pull out the job_skills listed for each job posting.

# Step 1: crawl the search result pages and collect the posting links.
print(''.join(['Crawl indeed.com for ', city, ', ', state, ' ', job_title,
               ' postings and generate a list of all of the job posting links']))
url_list = gen_url_list(city, state, job_title)

# Step 2: download every posting and extract its keywords.
print("Given the job posting links, pull out the keywords for each posting")
job_skills = job_posting_analysis(url_list)

Crawl indeed.com for Seattle, WA Data Scientist postings and generate a list of all of the job posting links
Jobs 1 to 10 of 725
0
10
20
30
40
50
60
70
Given the job posting links, pull out the keywords for each posting that is found in the provided keywords_input variable
1
11
truncated for readability

Now that we have the list of keywords in the job postings, calculate the number of postings in which each keyword appears. Then plot the data on a bar graph

# Count, for every keyword, the number of postings it appears in. Each entry
# of job_skills holds the keywords of one posting.
skill_frequency = Counter()
# A plain loop instead of a list comprehension: update() is called only for
# its side effect, so there is no result list worth building.
for posting_keywords in job_skills:
    skill_frequency.update(posting_keywords)
print(skill_frequency.items())

[('addedtojobcart', 73), ('applicationstatusdetail', 73), ('auc', 73), ('matlab', 365), ('worth', 73), ('merchant', 73), ('collaborate', 219), ('every', 146), ('tagging', 73), ('skillz', 73), ('companies', 219), ('vector', 73), ('clicktracks', 73), ('enhance', 146), ('enjoy', 73), ('leaders', 146), ('direct', 73), ('rigorous', 73), ('machines', 73), ('even', 73), ('hide', 73), ('selected', 73), ('children', 73), ('designing', 73), ('supplies', 73), ('centric', 73), ('behavior', 73), ('men', 73), ('createde', 73), ('hundreds', 73), ('employees', 146), ('economics', 146), ('reports', 73), truncated for readability ]

# One row per keyword: the keyword and the number of postings containing it.
# list(...) makes the (keyword, count) pairs an explicit sequence of rows.
data_to_plot = pd.DataFrame(list(skill_frequency.items()),
                            columns = ['Skill','Occurances'])

# Convert the raw counts into a percentage of the analysed postings.
data_to_plot['Occurances'] = data_to_plot['Occurances']*100.0/len(job_skills)

# Most frequent keywords first. sort_values replaces DataFrame.sort, which
# no longer exists in current pandas releases.
data_to_plot = data_to_plot.sort_values(by = 'Occurances',ascending = False)

test_data = data_to_plot.head(15)  # plot only top 15 skills

print(data_to_plot.head(20))

            Skill  Occurances
1148       python   80.109739
66        machine   80.109739
500             r   80.109739
329           sql   80.109739
468    statistics   70.096022
431   statistical   70.096022
544            js   60.082305
476   programming   60.082305
1382          pig   60.082305
1066         hive   60.082305
934        hadoop   60.082305
77     algorithms   50.068587
187     analytics   50.068587
55      scripting   50.068587
1242   predictive   50.068587
161            d3   50.068587
3          matlab   50.068587
943      analysis   50.068587
610         world   40.054870
341           llc   40.054870

# Bar chart of the most frequent keywords. The title uses the configured
# job_title so it stays correct when searching for a different job.
frame = test_data.plot(x='Skill',kind='bar',legend=None,
                  title='Percentage of ' + job_title
                  + ' Job Postings with each Skill, ('
                  + city + ', ' + state + ')')

# Optionally restrict the y-axis to zoom in, e.g. plt.ylim([40, 90]).

fig = frame.get_figure()

# Save the chart as '<city>_<state>_skills.pdf' in the working directory.
filename = '_'.join([city,state,'skills'])
filename = ''.join([filename,'.pdf'])
pp = PdfPages(filename)
try:
    pp.savefig(fig)
finally:
    # Always close the PDF so the file handle is released even if saving
    # the figure fails.
    pp.close()

Tech N Comp

Scrape Keywords from Indeed.com Job Postings

10 thoughts on “Scrape Keywords from Indeed.com Job Postings”

Leave a Reply Cancel reply

Related posts

10 thoughts on “Scrape Keywords from Indeed.com Job Postings”

Leave a Reply Cancel reply