I found this great Pythonic HTML parser today and wrote a quick script for snagging the links off a page.
linkpull.py
#!/usr/bin/env python3
# takes URL as input, outputs all links to a file
# requires the excellent pythonic HTML parser, requests_html
# parser source: https://github.com/kennethreitz/requests-html
from requests_html import HTMLSession
import os, sys
def maketextfile(linklist, filename):
    """Append every entry of *linklist* to *filename*, one link per line.

    The file is opened in append mode, so repeated runs accumulate links
    rather than overwriting earlier results.
    """
    with open(filename, 'a+') as out:
        out.writelines(f"{link}\n" for link in linklist)
def main():
    """Prompt for a URL, scrape all absolute links from it, and append
    them to a text file in the current working directory."""
    print("\nThis is a tiny link scraper!\n")
    # destination file for your links:
    filename = 'linklist.txt'
    # get URL source:
    url = input("Enter a complete URL: ")
    session = HTMLSession()
    r = session.get(url)
    # Fail fast on HTTP errors (4xx/5xx) instead of scraping an error page.
    r.raise_for_status()
    # absolute_links is a set, so its iteration order is arbitrary;
    # sort it so the output file is deterministic across runs.
    linklist = sorted(r.html.absolute_links)
    maketextfile(linklist, filename)
    print("\nFinished! Your file {} is located at \n {}\n".format(filename, os.getcwd()))
main()
(Part of my tiny-scripts repository on GitHub.)