# ---- form_extractor.py ----
from bs4 import BeautifulSoup
from requests_html import HTMLSession
from pprint import pprint
# Module-level requests_html session, created once at import time.
# get_all_forms() below sends its GET request through this object.
session = HTMLSession()
def get_all_forms(url):
    """Fetch the page at `url` and return every <form> tag found in it.

    The page is requested through the module-level `session` and parsed
    with BeautifulSoup's "html.parser"; the result is whatever
    `find_all("form")` yields for that document.
    """
    response = session.get(url)
    # For JavaScript-driven websites, response.html.render() can be called
    # at this point, before the markup is read.
    markup = response.html.html
    document = BeautifulSoup(markup, "html.parser")
    return document.find_all("form")
def get_form_details(form):
    """Return the action, method and form controls of a single form.

    Args:
        form: A parsed ``<form>`` tag. Only ``attrs`` and ``find_all`` are
            used on it; ``get_text`` is used on nested ``<textarea>`` tags.

    Returns:
        A dict with three keys:

        * ``"action"`` - the ``action`` attribute exactly as written in the
          page, or ``None`` when the form has none.
        * ``"method"`` - the ``method`` attribute lower-cased, ``"get"`` when
          the attribute is missing.
        * ``"inputs"`` - a list of dicts, all ``<input>`` tags first, then
          ``<select>`` tags, then ``<textarea>`` tags. Every dict has
          ``"type"``, ``"name"`` and ``"value"``; select entries also carry
          ``"values"``, the list of non-empty option values.
    """
    # The action is kept verbatim: URL paths and query strings are
    # case-sensitive, so lower-casing it could point at a different resource.
    action = form.attrs.get("action")
    # Only the method keyword is case-insensitive; HTML defaults it to GET.
    method = form.attrs.get("method", "get").lower()

    # <input> controls: type defaults to "text", value to the empty string.
    inputs = [
        {
            "type": input_tag.attrs.get("type", "text"),
            "name": input_tag.attrs.get("name"),
            "value": input_tag.attrs.get("value", ""),
        }
        for input_tag in form.find_all("input")
    ]

    for select in form.find_all("select"):
        select_options = []
        select_default_value = ""
        for select_option in select.find_all("option"):
            option_value = select_option.attrs.get("value")
            if not option_value:
                # Options without a usable value (e.g. placeholders) are skipped.
                continue
            select_options.append(option_value)
            # `selected` is a boolean attribute: a bare `<option selected>`
            # is parsed with an empty-string value, so test for presence
            # rather than truthiness.
            if "selected" in select_option.attrs:
                select_default_value = option_value
        if not select_default_value and select_options:
            # Nothing pre-selected: fall back to the first listed option.
            select_default_value = select_options[0]
        inputs.append(
            {
                "type": "select",
                "name": select.attrs.get("name"),
                "values": select_options,
                "value": select_default_value,
            }
        )

    for textarea in form.find_all("textarea"):
        inputs.append(
            {
                "type": "textarea",
                "name": textarea.attrs.get("name"),
                # A textarea has no `value` attribute; its default value is
                # the text between its opening and closing tags.
                "value": textarea.get_text(),
            }
        )

    return {"action": action, "method": method, "inputs": inputs}
if __name__ == "__main__":
    import sys

    # The page URL is the only command-line argument; exit with a usage
    # message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: python {sys.argv[0]} <url>")
    url = sys.argv[1]
    # Collect every <form> tag on the page.
    forms = get_all_forms(url)
    # Print each form's details under a banner numbered from 1.
    for i, form in enumerate(forms, start=1):
        form_details = get_form_details(form)
        print("=" * 50, f"form #{i}", "=" * 50)
        pprint(form_details)
# ---- form_submitter.py ----
from bs4 import BeautifulSoup
from pprint import pprint
from urllib.parse import urljoin
import webbrowser
import sys
from form_extractor import get_all_forms, get_form_details, session
# The page URL is the only command-line argument; exit with a usage message
# instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit(f"Usage: python {sys.argv[0]} <url>")
url = sys.argv[1]
all_forms = get_all_forms(url)
if not all_forms:
    # Nothing to choose from, so there is nothing to submit either.
    sys.exit(f"No forms found at {url}")
# List every form with its 1-based index so the user can pick one.
for i, f in enumerate(all_forms, start=1):
    form_details = get_form_details(f)
    print(f"{i} #")
    pprint(form_details)
    print("=" * 50)
try:
    choice = int(input("Enter form indice: "))
except ValueError:
    sys.exit("The form index must be a whole number.")
# Reject 0 and negatives explicitly: Python's negative indexing would
# otherwise silently select a form counted from the end of the list.
if not 1 <= choice <= len(all_forms):
    sys.exit(f"The form index must be between 1 and {len(all_forms)}.")
# Extract the details of the chosen form (the menu is 1-based).
form_details = get_form_details(all_forms[choice - 1])
pprint(form_details)
# The payload to submit, keyed by control name.
data = {}
for input_tag in form_details["inputs"]:
    field_name = input_tag["name"]
    if not field_name:
        # A control without a name cannot be submitted as a key/value pair;
        # storing it would create a bogus `None` key in the payload.
        continue
    if input_tag["type"] == "hidden":
        # Hidden fields are sent back with the value the page supplied.
        data[field_name] = input_tag["value"]
    elif input_tag["type"] == "select":
        options = input_tag["values"]
        if not options:
            # No options to offer: keep the (empty) default without prompting.
            data[field_name] = input_tag["value"]
            continue
        # Show the available options, marking the pre-selected one.
        for i, option in enumerate(options, start=1):
            if option == input_tag["value"]:
                print(f"{i} # {option} (default)")
            else:
                print(f"{i} # {option}")
        answer = input(f"Enter the option for the select field '{field_name}' (1-{len(options)}): ")
        try:
            picked = int(answer)
        except ValueError:
            # Not a number: treated like an out-of-range answer below.
            picked = 0
        if 1 <= picked <= len(options):
            value = options[picked - 1]
        else:
            # Invalid or out-of-range answer: fall back to the default.
            value = input_tag["value"]
        data[field_name] = value
    elif input_tag["type"] != "submit":
        # Every other control except submit buttons is filled in by the user.
        value = input(f"Enter the value of the field '{field_name}' (type: {input_tag['type']}): ")
        data[field_name] = value
# Resolve the form's action (possibly a relative URL) against the page URL.
url = urljoin(url, form_details["action"])
# pprint(data)
if form_details["method"] == "post":
    res = session.post(url, data=data)
else:
    # GET, plus any unrecognised method value: HTML treats an invalid
    # `method` as GET, and this guarantees `res` is always bound for the
    # code that follows.
    res = session.get(url, params=data)
# Rewrite relative URLs in the response to absolute ones, so the locally
# saved copy still points at the original site's stylesheets, scripts,
# images and links.
soup = BeautifulSoup(res.content, "html.parser")
for tag_name, url_attr in (("link", "href"), ("script", "src"), ("img", "src"), ("a", "href")):
    for tag in soup.find_all(tag_name):
        if url_attr not in tag.attrs:
            # e.g. inline <script> blocks or anchors without an href
            continue
        try:
            tag.attrs[url_attr] = urljoin(url, tag.attrs[url_attr])
        except ValueError:
            # A malformed URL in the page is left untouched (best effort).
            pass
# Write the page to disk; the context manager closes the file and the
# explicit encoding avoids platform-dependent encode errors.
with open("page.html", "w", encoding="utf-8") as page_file:
    page_file.write(str(soup))
# Open the saved page in the default browser.
webbrowser.open("page.html")