Jsoup returns Status 400 error on some links but not others on the same page
So weird: I get the results I need for the first 12 or so links, then it crashes on the same link (AAA 090) every time. I'm not sure what the difference is between the links.
import org.jsoup.*;
import org.jsoup.helper.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import org.w3c.dom.Document;
import java.io.*; // Only needed if scraping a local File.

public class Scraper {

    public Scraper() {
        org.jsoup.nodes.Document page = null;
        org.jsoup.nodes.Document prefix = null;
        org.jsoup.nodes.Document course = null;
        org.jsoup.nodes.Document cls = null;
        try {
            page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();
            // get the page title
            String title = page.title();
            System.out.println("title: " + title);
            // get all links in page
            Elements links = page.select("a[href]");
            for (Element link : links) {
                if (!link.text().equalsIgnoreCase("HELP") && !link.text().equalsIgnoreCase("Exit")) {
                    prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();
                    Elements links2 = prefix.select("a[href]");
                    for (Element link2 : links2) {
                        if (link2.text().matches("[A-Z]{3}")) {
                            course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();
                            Elements links3 = course.select("a[href]");
                            for (Element link3 : links3) {
                                if (link3.text().matches("[A-Z]{3} [0-9]{3}")) {
                                    cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();
                                    Elements links4 = cls.getAllElements();
                                    //Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");
                                    System.out.println("\nhref = " + link3.attr("href") + "\n" + cls.text() + "\n");
                                    System.out.println("link: " + cls.tagName());
                                    System.out.println("Course Number: " + link3.data().toString());
                                }
                            }
                        }
                    }
                }
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    public static void main(String[] args) {
        new Scraper();
    }
}
org.jsoup.HttpStatusException: HTTP error fetching URL. Status=400, URL=https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_search_type=SB_COURSE&_subj_code=AAA&pi_crse_numb=090&pi_archive_date=&pi_course_status=A&pi_term_code=201920
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:776)
    at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722)
    at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306)
    at org.jsoup.helper.HttpConnection.get(HttpConnection.java:295)
    at Scraper.<init>(Scraper.java:42)
    at Scraper.main(Scraper.java:64)
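(A useful diagnostic here: jsoup can be told not to throw on non-2xx responses, so the run can log which hrefs fail instead of dying. A minimal sketch, reusing link3 and cls from the loop above:)

// Sketch: ignoreHttpErrors(true) turns the 400 into an inspectable response
// instead of an HttpStatusException.
Connection.Response res = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href"))
        .ignoreHttpErrors(true)
        .execute();
if (res.statusCode() == 200) {
    cls = res.parse();
} else {
    System.out.println("Skipped " + link3.attr("href") + " -> HTTP " + res.statusCode());
}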
jsoup

asked Nov 26 '18 at 23:52 (edited Nov 27 '18 at 1:07) – bob wonch
I have copy-pasted your original code and also get the desired output. Can you try again with a User-Agent set? e.g.
String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko";
Jsoup.connect(someUrl).userAgent(userAgent).get();
– Eritrean
Dec 5 '18 at 17:07
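(Applied to one of the scraper's nested fetches, that suggestion would look roughly like this; a sketch, with userAgent being the string from the comment above:)

String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko";
// Same fetch as in the question, but identifying as a browser.
cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href"))
        .userAgent(userAgent)
        .get();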
1 Answer
I think you are following the path "Why make it simple if it can be complicated?". I just had a look at the website you are trying to scrape. The site is well structured and organized, so there is no need for concatenating strings, using regex, and doing all the scraping in the constructor. I would suggest the following steps:
1. Get the first page and select the table with the prefix links from A to Z.
2. For each link from step 1, get the page of the prefix and select the table with the course prefix links. For example, for the prefix A, get the links from AAA to AVT.
3. For each course prefix link from step 2, get the page of the course prefix and select the table containing the course links. For example, for AAA, get the links from AAA 010 to AAA 999.
Your code could be something like:
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {

    final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"
            + "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";

    public static void main(String[] args) {
        try {
            Document page = Jsoup.connect(homePage).get();
            Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");
            for (Element prefix : prefixLinks) {
                Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();
                Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");
                for (Element coursePrefix : coursePrefixLinks) {
                    Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();
                    Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");
                    for (Element course : courseLinks) {
                        Document coursePage = Jsoup.connect(course.absUrl("href")).get();
                        try {
                            String courseNr = coursePage.select("tr:contains(Course:)").first().text();
                            String courseTitle = coursePage.select("tr:contains(Title:)").first().text();
                            System.out.println(courseNr + "\n" + courseTitle + "\n********************");
                        } catch (NullPointerException np) {
                            System.out.println("Broken link, Page not Found");
                        }
                    }
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}

answered Dec 4 '18 at 11:12 (edited Dec 4 '18 at 13:44) – Eritrean
Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.
– Eritrean
Dec 4 '18 at 14:07
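(For a crawl of that size, it may also help to give each request an explicit timeout and to pause briefly between fetches; whether the server throttles rapid-fire requests is an assumption, not something the answer confirms. A sketch of the innermost fetch:)

// Sketch: explicit 10-second read timeout plus a short pause between requests.
Document coursePage = Jsoup.connect(course.absUrl("href"))
        .timeout(10_000)
        .get();
try {
    Thread.sleep(200); // at most ~5 requests per second
} catch (InterruptedException ie) {
    Thread.currentThread().interrupt();
}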
Thanks for the input! With this code, I get the same error on AAA 090... I tried something similar before, and that's how I ended up breaking up the URL and replacing the bits that are missing from the returned link.
– bob wonch
Dec 5 '18 at 16:14
I ended up fixing it by splitting the string and substituting for a few specific cases.
– bob wonch
Dec 5 '18 at 16:15
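(The exact cases aren't shown, but one shape such a fix can take is re-encoding the query values of the returned href before fetching it, in case a value contains characters the server rejects. A hypothetical sketch; cleanHref is an invented name, and URLEncoder.encode(String, Charset) needs Java 10+:)

import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

// Hypothetical helper: split the href at '?', then re-encode each query value.
static String cleanHref(String href) {
    int q = href.indexOf('?');
    if (q < 0) return href;
    StringBuilder out = new StringBuilder(href.substring(0, q + 1));
    String[] params = href.substring(q + 1).split("&");
    for (int i = 0; i < params.length; i++) {
        if (i > 0) out.append('&');
        int eq = params[i].indexOf('=');
        if (eq < 0) { out.append(params[i]); continue; }
        out.append(params[i], 0, eq + 1)
           .append(URLEncoder.encode(params[i].substring(eq + 1), StandardCharsets.UTF_8));
    }
    return out.toString();
}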
I can't reproduce your issue. With the code above I get the desired output. The first broken link is for the course BIO 223, which gets caught in the innermost for loop.
– Eritrean
Dec 5 '18 at 16:38