9. Take the Institution name as input. Use Pydantic to define the schema for the desired output and create a custom output parser. Invoke the Chain and Fetch Results. Extract the below Institution related details from Wikipedia: The founder of the Institution. When it was founded. The current branches in the institution . How many employees are working in it. A brief 4-line summary of the institution.
PROGRAM:
# Module or library install command (run this in terminal before running the script)
# pip install "pydantic>=2.0" wikipedia-api requests beautifulsoup4 lxml
from pydantic import BaseModel
from typing import List, Optional
import wikipediaapi
import requests
from bs4 import BeautifulSoup
import re
class InstitutionProfile(BaseModel):
founder: Optional[str] = "Unknown"
Established: Optional[int] = 0
branches: List[str] = ["Unknown"]
employee_count: Optional[int] = 0
summary: str
def fetch_wikipedia_data(institution_name: str) -> dict:
user_agent = "MyInstitutionInfoBot/1.0 (contact: [email protected])"
wiki_wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language="en")
page = wiki_wiki.page(institution_name)
if not page.exists():
raise ValueError(f"Wikipedia page for '{institution_name}' not found.")
summary = page.summary[:300]
wiki_url = f"https://en.wikipedia.org/wiki/{institution_name.replace(' ', '_')}"
headers = {"User-Agent": user_agent}
response = requests.get(wiki_url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
infobox = soup.find("table", {"class": "infobox"})
data = {
"founder": "Unknown",
"Established": 0,
"branches": ["Unknown"],
"employee_count": 0,
"summary": summary,
}
if infobox:
for row in infobox.find_all("tr"):
header = row.find("th")
value = row.find("td")
if header and value:
key = header.text.strip()
val = value.text.strip()
if key in ["Founder", "Founders", "Founder(s)"]:
data["founder"] = val
elif key in ["Established", "Founded", "Formation"]:
match = re.search(r"\b(18\d{2}|19\d{2}|20\d{2})\b", val)
if match:
data["Established"] = int(match.group(0))
elif key in ["Total staff", "Employees", "Staff"]:
match = re.search(r"(\d{3,5})", val)
if match:
data["employee_count"] = int(match.group(0))
elif key in ["Address", "Location"]:
data["branches"] = [val.split(",")[0]]
if data["founder"] == "Unknown":
dbpedia_data = fetch_dbpedia_data(institution_name)
data.update(dbpedia_data)
return data
def fetch_dbpedia_data(institution_name: str) -> dict:
dbpedia_url = f"https://dbpedia.org/data/{institution_name.replace(' ', '_')}.json"
headers = {"User-Agent": "MyInstitutionInfoBot/1.0"}
response = requests.get(dbpedia_url, headers=headers)
if response.status_code != 200:
return {}
dbpedia_json = response.json()
entity_url = f"http://dbpedia.org/resource/{institution_name.replace(' ', '_')}"
data = {"founder": "Unknown"}
if entity_url in dbpedia_json:
entity = dbpedia_json[entity_url]
if "http://dbpedia.org/ontology/foundedBy" in entity:
data["founder"] = entity["http://dbpedia.org/ontology/foundedBy"][0]["value"]
return data
def create_institution_profile(institution_name: str) -> InstitutionProfile:
data = fetch_wikipedia_data(institution_name)
profile = InstitutionProfile(
founder=data["founder"],
Established=data["Established"],
branches=data["branches"],
employee_count=data["employee_count"],
summary=data["summary"],
)
return profile
if __name__ == "__main__":
institution_name = input("Enter institution name: ")
try:
profile = create_institution_profile(institution_name)
print(profile.model_dump_json(indent=2))
except ValueError as e:
print(e)OUTPUT:
Enter institution name: Stanford University
{
"founder": "Leland and Jane Stanford",
"Established": 1885,
"branches": [
"Stanford"
],
"employee_count": 0,
"summary": "Leland Stanford Junior University, commonly referred to as Stanford University, is a private research university in Stanford, California, United States. It was founded in 1885 by railroad magnate Leland Stanford (the eighth governor of and then-incumbent United States senator representing California"
}