Python: BeautifulSoup Scrape, Blank Descriptions For Courses Messing Up Data
I'm trying to scrape some course data from the site https://bulletins.psu.edu/university-course-descriptions/undergraduate/ for a project.
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 5 20:37:33 2018
@author: DazedFury
"""
# Here, we're just importing both Beautiful Soup and the Requests library
from bs4 import BeautifulSoup
import requests

# returns a CloudflareScraper instance
#scraper = cfscrape.create_scraper()

# Output text file and starting URL
text_file = open("Output.txt", "w", encoding='UTF-8')
page_link = 'https://bulletins.psu.edu/university-course-descriptions/undergraduate/acctg/'
page_response = requests.get(page_link)
page_content = BeautifulSoup(page_response.content, "html.parser")

# List for storing the URLs of the per-subject course pages
URLArray = []

# Find links
for link in page_content.find_all('a'):
    if '/university-course-descriptions/undergraduate' in link.get('href'):
        URLArray.append(link.get('href'))

k = 1
# Parse loop
while k != 242:
    print("Writing " + str(k))
    completeURL = 'https://bulletins.psu.edu' + URLArray[k]
    # this is the url that we've already determined is safe and legal to scrape from.
    page_link = completeURL
    # here, we fetch the content from the url, using the requests library
    page_response = requests.get(page_link)
    # we use the html parser to parse the url content and store it in a variable.
    page_content = BeautifulSoup(page_response.content, "html.parser")
    page_content.prettify  # note: this is a no-op -- prettify() isn't called and its result isn't used

    # Find the course titles and course descriptions on the page
    paragraphs = page_content.find_all('div', {'class': 'course_codetitle'})
    paragraphs2 = page_content.find_all('div', {'class': 'courseblockdesc'})

    j = 0
    for i in range(len(paragraphs)):
        if i % 2 == 0:
            text_file.write(paragraphs[i].get_text())
            text_file.write("\n")
            if j < len(paragraphs2):
                text_file.write(" ".join(paragraphs2[j].get_text().split()))
                text_file.write("\n")
                text_file.write("\n")
            if paragraphs2[j].get_text() != "":
                j += 1
    k += 1

#FORMAT
#text_file.write("<p style=\"page-break-after: always;\"> </p>")
#text_file.write("\n\n")

# Close the text file
text_file.close()
The specific info I need is the course title and the description. The problem is that some of the courses have blank descriptions, which throws off the pairing and produces bad data.
I thought about just checking whether the course description is blank, but on the site the 'courseblockdesc' div doesn't exist at all when a course has no description. So when I find_all on 'courseblockdesc', no element is added to the list for those courses, and the order ends up misaligned. There are too many mismatches to fix by hand, so I was hoping someone could help me find a solution to this.
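To make the mismatch concrete, here is a minimal, self-contained sketch. The HTML snippet is invented for illustration (it only mimics the class names used above), but it shows how one missing courseblockdesc div leaves the two find_all lists with different lengths, so index-based pairing attaches the wrong description to a course.

from bs4 import BeautifulSoup

# Hypothetical markup mimicking the bulletin layout: the middle course has no description div.
html = """
<div class="courseblock">
  <div class="course_codetitle">ACCTG 101</div>
  <div class="courseblockdesc">Introductory financial accounting.</div>
</div>
<div class="courseblock">
  <div class="course_codetitle">ACCTG 102</div>
</div>
<div class="courseblock">
  <div class="course_codetitle">ACCTG 103</div>
  <div class="courseblockdesc">Managerial accounting.</div>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
titles = soup.find_all("div", {"class": "course_codetitle"})
descs = soup.find_all("div", {"class": "courseblockdesc"})

print(len(titles), len(descs))  # 3 2 -- the lists are already out of sync
# Pairing by index silently gives ACCTG 102 the description that belongs to ACCTG 103.
for title, desc in zip(titles, descs):
    print(title.get_text(), "->", desc.get_text())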
python web-scraping beautifulsoup
You initialize k = 1 but indexing starts from 0 – Amine Messaoudi, Nov 27 '18 at 0:01
@AmineMessaoudi Right, I did this because in the list of links I pull, the first one is a link that I don't need. I just decided to start at one to skip that link and keep it simple. – DazedFury, Nov 27 '18 at 0:13
Do you still need the title if no description? – QHarr, Nov 27 '18 at 10:32
2 Answers
The simplest solution would be to do a single find_all for the parent elements of the items you are looking for, then look up the title and description inside each parent:
for block in page_content.find_all('div', class_="courseblock"):
    title = block.find('div', {'class': 'course_codetitle'})
    description = block.find('div', {'class': 'courseblockdesc'})
    # do what you need with the navigable strings here
    print(title.get_text())
    if description:
        print(description.get_text())
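If it helps, here is a rough sketch of how that parent-first loop could slot into the file-writing loop from the question. This is only an illustration: it assumes the page_content and text_file variables from the original script, and that every courseblock contains a course_codetitle div.

# Sketch only: reuses page_content and text_file from the question's script.
for block in page_content.find_all('div', class_="courseblock"):
    title = block.find('div', {'class': 'course_codetitle'})
    description = block.find('div', {'class': 'courseblockdesc'})
    text_file.write(title.get_text().strip() + "\n")
    if description:
        # Collapse internal whitespace the same way the original script does.
        text_file.write(" ".join(description.get_text().split()) + "\n")
    text_file.write("\n")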
Worked perfect! – DazedFury, Nov 27 '18 at 0:23
You may be over-complicating the procedure somewhat, but you're certainly on the right track. Instead of storing the information in an array and relying on all of the indexes to line up, write the text file as you traverse the courses, pulling title and description dynamically from each course block. If a block doesn't have a description, you can handle that on the spot. Here's a working example:
from bs4 import BeautifulSoup
import requests

url = "https://bulletins.psu.edu/university-course-descriptions/undergraduate/acctg/"

with open("out.txt", "w", encoding="UTF-8") as f:
    for link in BeautifulSoup(requests.get(url).content, "html.parser").find_all("a"):
        if "/university-course-descriptions/undergraduate" in link["href"]:
            soup = BeautifulSoup(requests.get("https://bulletins.psu.edu" + link["href"]).content, "html.parser")

            for course in soup.find_all("div", {"class": "courseblock"}):
                title = course.find("div", {"class": "course_title"}).get_text().strip()

                try:
                    desc = course.find("div", {"class": "courseblockdesc"}).get_text().strip()
                except AttributeError:
                    desc = "No description available"

                f.write(title + "\n" + desc + "\n\n")
Output snippet (from end of text file to validate alignment):
WLED 495: **SPECIAL TOPICS**
No description available
WLED 495B: Field Experience for World Languages Teacher Preparation in Grades 1-5
WL ED 495B Field Experience for World Languages Teacher Preparation in Grades 1-5 (3) Practicum situation where Prospective World Language teachers will demonstrate acquired knowledge on second language learning/teaching and educational theories. Prospective World Language teachers will have assigned school placements and will attend a weekly seminar where issues in World Language learning and teaching will be discussed. At their assigned school placement, prospective World Language teachers will have many opportunities to observe/work with children in grades 1-5 (1) focusing on second language learning/teaching and the socio/cultural issues associated to classroom practices while implementing and self-evaluated own designed activities and lessons; (2) weekly seminars will engage students in reflective activities that will enable them to analyze each week's events; (3) inquiry projects on teaching and learning of World Languages.
WLED 495C: Field Experience for World Languages Teacher Preparation in Grades 6-12
WL ED 495C Field Experience for World Languages Teacher Preparation in Grades 6-12 (3) Practicum situation where prospective World Language teachers will demonstrate acquired knowledge on second language learning/teaching and educational theories. Prospective World Language teachers will have assigned school placements in grades 6-12 and will attend a weekly seminar where issues in World Language learning and teaching will be discussed. At their assigned school placement, prospective World Language teachers will have many opportunities to observe/work with students in grades 6-12 (1) focusing on second language learning/teaching and the socio/cultural issues associated to classroom practices while implementing and self-evaluating their own designed activities and lessons, (2) weekly seminars will engage students in reflective activities that will enable them to analyze each week's events, and (3) inquiry projects on teaching and learning of World Languages.
Additional minor remarks:
It's a good idea to use the with keyword for file I/O. This will automatically close the file handle when done.
Verbose intermediate variables and comments that add noise, like:
# Here, we're just importing both Beautiful Soup and the Requests library
from bs4 import BeautifulSoup
or
#Close Text File
text_file.close()
can always be removed, making the program logic easier to follow.
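To illustrate the first remark, the with statement is roughly shorthand for a try/finally that closes the handle for you. A small sketch of the two equivalent forms, using a throwaway filename:

# Manual management: easy to forget close() when an exception or early return happens.
f = open("out.txt", "w", encoding="UTF-8")
try:
    f.write("ACCTG 101\n")
finally:
    f.close()

# Context-manager form: the file is closed automatically when the block exits,
# even if an exception is raised inside it.
with open("out.txt", "w", encoding="UTF-8") as f:
    f.write("ACCTG 101\n")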