Beautiful Soup Exercises
Problem 1: You have a basic HTML string for a webpage. Parse this HTML using Beautiful Soup and extract just the text inside the main <h1> heading.
beautiful-soup-exercises.py
from bs4 import BeautifulSoup
# Step 1: sample page markup held in a plain Python string
html_doc = """
<html>
<head><title>My Tech Blog</title></head>
<body>
<h1>Welcome to Web Scraping!</h1>
<p>This is a paragraph of text.</p>
</body>
</html>
"""
# Step 2: build the parse tree using the 'html.parser' backend
soup = BeautifulSoup(html_doc, 'html.parser')
# Step 3: locate the first <h1> element, then read the text it contains
# (soup.h1 is an equivalent shortcut for soup.find('h1'))
h1_tag = soup.find('h1')
heading_text = h1_tag.get_text()
# Show that the input was an ordinary str, followed by the extracted heading
print("Raw HTML type:", type(html_doc))
print("Extracted Heading:", heading_text)
Problem 2: You are scraping a recipe website and have an HTML list of ingredients. Use Beautiful Soup to find all the list items (<li>) and save their text into a standard Python list.
beautiful-soup-exercises.py
from bs4 import BeautifulSoup
# Step 1: recipe markup containing an unordered list of ingredients
html_ingredients = """
<div class="recipe">
<h3>Pancake Ingredients</h3>
<ul>
<li>1 cup flour</li>
<li>2 tablespoons sugar</li>
<li>1 cup milk</li>
<li>1 egg</li>
</ul>
</div>
"""
# Step 2: build the parse tree using the 'html.parser' backend
soup = BeautifulSoup(html_ingredients, 'html.parser')
# Step 3: collect every <li> element in document order
li_tags = soup.find_all('li')
# Step 4: copy each item's text into a plain Python list
# (an explicit loop; a list comprehension would do the same job)
ingredients = []
for item in li_tags:
    ingredients.append(item.get_text())
print("Extracted Ingredients List:")
print(ingredients)
Problem 3: You are analyzing a webpage that has several links (<a> tags). You only want to extract the actual URLs (the href attribute) of the links that lead to external websites. These specific links have been given the HTML class “external”.
beautiful-soup-exercises.py
from bs4 import BeautifulSoup
# 1. The raw HTML data: one internal link and two links marked class="external"
html_links = """
<body>
<p>Check out our <a href="/about-us" class="internal">About Us</a> page.</p>
<p>Learn more Python on <a href="https://docs.python.org" class="external">Python Docs</a>.</p>
<p>Follow us on <a href="https://twitter.com" class="external">Twitter</a>.</p>
</body>
"""
# 2. Parse the HTML
soup = BeautifulSoup(html_links, 'html.parser')
# 3. Find all 'a' tags that specifically have the class "external"
# Note: We use 'class_' with an underscore because 'class' is a reserved keyword in Python
external_links = soup.find_all('a', class_='external')
# 4. Extract the 'href' attribute from those links
urls = [link['href'] for link in external_links]
print("Found External URLs:")
# The loop body must be indented; without it Python raises IndentationError
for url in urls:
    print("-", url)