Logging in to LinkedIn with python requests sessions

Posted on

Question :

Logging in to LinkedIn with python requests sessions

I’m trying to log into LinkedIn using Python requests:

import sys
import requests
from BeautifulSoup import BeautifulSoup


payload={
    'session-key' : 'user@email.com',
    'session-password' : 'password'
}

URL='https://www.linkedin.com/uas/login-submit'
s=requests.session()
s.post(URL,data=payload)

r=s.get('http://www.linkedin.com/nhome')
soup = BeautifulSoup(r.text)
print soup.find('title')

I can’t seem to log in using this method. I even tried playing with csrf etc. in the payload, but aren’t sessions supposed to take care of that for you?

Note about the last line: I use the title to check if I’ve successfully logged in. (I should see “Welcome! | LinkedIn” if I have signed in, instead I see “World’s Largest Professional Network | LinkedIn”

Am I missing something?

Asked By: mirri66

||

Answer #1:

I modified a web-scraping template I use for most of my Python-based scraping needs to fit your needs. Verified it worked with my own login info.

The way it works is by mimic-ing a browser and maintaining a cookieJar that stores your user session. Got it to work with BeautifulSoup for you as well.

Note: This is a Python2 version. I added a working Python3 example further below by request.

import cookielib
import os
import urllib
import urllib2
import re
import string
from BeautifulSoup import BeautifulSoup

username = "user@email.com"
password = "password"

cookie_filename = "parser.cookies.txt"

class LinkedInParser(object):

    def __init__(self, login, password):
        """ Start up... """
        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                           'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print title

        self.cj.save()


    def loadPage(self, url, data=None):
        """
        Utility function to load HTML from URLs for us with hack to continue despite 404
        """
        # We'll print the url in case of infinite loop
        # print "Loading URL: %s" % url
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            return ''.join(response.readlines())
        except:
            # If URL doesn't load for ANY reason, try again...
            # Quick and dirty solution for 404 returns because of network problems
            # However, this could infinite loop if there's an actual problem
            return self.loadPage(url, data)

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        html = self.loadPage("https://www.linkedin.com/")
        soup = BeautifulSoup(html)
        csrf = soup.find(id="loginCsrfParam-login")['value']

        login_data = urllib.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        })

        html = self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        html = self.loadPage("https://www.linkedin.com/feed/")
        soup = BeautifulSoup(html)
        return soup.find("title")

parser = LinkedInParser(username, password)

Update June 19, 2014: Added parsing for CSRF token from homepage for use in updated login process.

Update July 23, 2015: Adding a Python 3 example here. Basically requires substituting library locations and removing deprecated methods. It’s not perfectly formatted or anything, but it functions. Sorry for the rush job. In the end the principals and steps are identical.

import http.cookiejar as cookielib
import os
import urllib
import re
import string
from bs4 import BeautifulSoup

username = "user@email.com"
password = "password"

cookie_filename = "parser.cookies.txt"

class LinkedInParser(object):

    def __init__(self, login, password):
        """ Start up... """
        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                           'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print(title)

        self.cj.save()


    def loadPage(self, url, data=None):
        """
        Utility function to load HTML from URLs for us with hack to continue despite 404
        """
        # We'll print the url in case of infinite loop
        # print "Loading URL: %s" % url
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            return ''.join([str(l) for l in response.readlines()])
        except Exception as e:
            # If URL doesn't load for ANY reason, try again...
            # Quick and dirty solution for 404 returns because of network problems
            # However, this could infinite loop if there's an actual problem
            return self.loadPage(url, data)

    def loadSoup(self, url, data=None):
        """
        Combine loading of URL, HTML, and parsing with BeautifulSoup
        """
        html = self.loadPage(url, data)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        soup = self.loadSoup("https://www.linkedin.com/")
        csrf = soup.find(id="loginCsrfParam-login")['value']
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        }).encode('utf8')

        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")

parser = LinkedInParser(username, password)
Answered By: garromark

Answer #2:

This is a much simpler version.

import requests
from bs4 import BeautifulSoup

client = requests.Session()

HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'

html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find(id="loginCsrfParam-login")['value']

login_information = {
    'session_key':'Login',
    'session_password':'Password',
    'loginCsrfParam': csrf,
}

client.post(LOGIN_URL, data=login_information)

client.get('Any_Linkedin_URL')
Answered By: Morgan Allen

Answer #3:

2019 Version.

Slightly revised version working that takes into account the new structure of the page to find the connection cookie and adds the trk parameter.

import requests
from bs4 import BeautifulSoup

email = ""
password = ""

client = requests.Session()

HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'

html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find('input', {'name': 'loginCsrfParam'}).get('value')

login_information = {
    'session_key': email,
    'session_password': password,
    'loginCsrfParam': csrf,
    'trk': 'guest_homepage-basic_sign-in-submit'
}

client.post(LOGIN_URL, data=login_information)

response = client.get('')
Answered By: Densetsu_No

Answer #4:

2020 version of @garromark’s accepted solution:

import http.cookiejar as cookielib
import os
import urllib
import re
import string
from bs4 import BeautifulSoup

username = ""
password = ""

cookie_filename = "parser.cookies.txt"


class LinkedInParser(object):

    def __init__(self, login, password):
        """ Start up... """
        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', 'Mozilla/5.0')
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print(title)

        # self.cj.save()

    def loadPage(self, url, data=None):
        """
        Utility function to load HTML from URLs for us with hack to continue despite 404
        """
        # We'll print the url in case of infinite loop
        # print "Loading URL: %s" % url
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            content = ''.join([str(l) for l in response.readlines()])
            print("Page loaded: %s n Content: %s n" % (url, content))
            return content
        except Exception as e:
            # If URL doesn't load for ANY reason, try again...
            # Quick and dirty solution for 404 returns because of network problems
            # However, this could infinite loop if there's an actual problem
            print("Exception on %s load: %s" % (url, e))
            # return self.loadPage(url, data)

    def loadSoup(self, url, data=None):
        """
        Combine loading of URL, HTML, and parsing with BeautifulSoup
        """
        html = self.loadPage(url, data)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        soup = self.loadSoup("https://www.linkedin.com/login")
        loginCsrfParam = soup.find("input", {"name": "loginCsrfParam"})['value']
        csrfToken = soup.find("input", {"name": "csrfToken"})['value']
        sIdString = soup.find("input", {"name": "sIdString"})['value']
        print("loginCsrfParam: %s" % loginCsrfParam)
        print("csrfToken: %s" % csrfToken)
        print("sIdString: %s" % sIdString)
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': loginCsrfParam,
            'csrfToken': csrfToken,
            'sIdString': sIdString
        }).encode('utf8')

        self.loadPage("https://www.linkedin.com/checkpoint/lg/login-submit", login_data)

    def loadTitle(self):
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")


parser = LinkedInParser(username, password)

Answer #5:

The OP’s solution worked for me with only a very slight modification.

Change ‘session-key’ to ‘session_key’ and change ‘session-password’ to session_password.’

Other than that, the code is good as it stands.

Answered By: Scott N

Leave a Reply

Your email address will not be published.