#100DaysofCode – link scraping script (python)

I found this great Pythonic HTML parser today and wrote a quick script for snagging the links off a page.


linkpull.py

#!/usr/bin/python

# takes URL as input, outputs all links to a file
# requires the excellent pythonic HTML parser, requests_html
# parser source: https://github.com/kennethreitz/requests-html

from requests_html import HTMLSession
import os, sys

def maketextfile(linklist, filename):
    """Append every link in ``linklist`` to ``filename``, one per line.

    Args:
        linklist: Iterable of link strings to write.
        filename: Path of the destination text file. The file is created
            if it does not exist; existing content is kept and the new
            links are added after it.
    """
    # Append mode never truncates, so repeated runs accumulate links.
    # UTF-8 is pinned so non-ASCII URLs are written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(filename, 'a', encoding='utf-8') as file:
        file.writelines(link + '\n' for link in linklist)

def main():
    """Prompt for a URL, collect the page's links, and save them to a file.

    Reads a URL from standard input, fetches it with an ``HTMLSession``,
    and appends the page's absolute links to ``linklist.txt`` in the
    current working directory via ``maketextfile``. Returns early, without
    fetching or writing anything, when no URL is entered.
    """
    print("\nThis is a tiny link scraper!\n")

    # destination file for your links:
    filename = 'linklist.txt'

    # get URL source; strip stray whitespace that comes with pasted input:
    url = input("Enter a complete URL: ").strip()
    if not url:
        print("\nNo URL entered; nothing to do.\n")
        return

    session = HTMLSession()
    try:
        r = session.get(url)
        # absolute_links is documented as a set, so sort it to give the
        # output file a stable, reproducible order.
        linklist = sorted(r.html.absolute_links)
    finally:
        # Release the session's connections even if the request fails.
        session.close()

    maketextfile(linklist, filename)
    print("\nFinished! Your file {} is located at \n {}\n".format(filename, os.getcwd()))

# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger the interactive prompt.
if __name__ == "__main__":
    main()

(part of my tiny scripts repository on github.)

Leave a Comment

This site uses Akismet to reduce spam. Learn how your comment data is processed.