Jsoup return Status 400 error on some links but not others on the same page












0















So weird I get the results I need for the first 12 or so links then it crashes on the same link (AAA 090) every time. Im' not sure what the difference is between the links.



    import org.jsoup.*;
import org.jsoup.helper.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import org.w3c.dom.Document;

import java.io.*; // Only needed if scraping a local File.

public class Scraper {


public Scraper() {

org.jsoup.nodes.Document page = null;
org.jsoup.nodes.Document prefix = null;
org.jsoup.nodes.Document course = null;
org.jsoup.nodes.Document cls = null;
try {
page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();

// get the page title
String title = page.title();
System.out.println("title: " + title);

// get all links in page
Elements links = page.select("a[href]");
for (Element link : links) {
if(!link.text().equalsIgnoreCase("HELP")&&!link.text().equalsIgnoreCase("Exit"))
{
prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();
Elements links2 = prefix.select("a[href]");
for (Element link2 : links2) {
if(link2.text().matches("[A-Z]{3}"))
{
course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();
Elements links3 = course.select("a[href]");
for (Element link3 : links3) {
if(link3.text().matches("[A-Z]{3} [0-9]{3}"))
{
cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();
Elements links4 = cls.getAllElements();


//Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");
System.out.println("nhref = " + link3.attr("href") + "n" + cls.text() + "n");
System.out.println("link: " + cls.tagName());
System.out.println("Course Number: " + link3.data().toString());

}
}
}
}
}
}
}catch (IOException ioe) {
ioe.printStackTrace();
}
}

public static void main (String args) {

new Scraper();
}

}



org.jsoup.HttpStatusException: HTTP error fetching URL. Status=400, >URL=https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?>pi_search_type=SB_COURSE&_subj_code=AAA&pi_crse_numb=090&pi_archive_date=&pi_co>urse_status=A&pi_term_code=201920 at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:776 )at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722) at >org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306) at >org.jsoup.helper.HttpConnection.get(HttpConnection.java:295) at Scraper.(Scraper.java:42) at Scraper.main(Scraper.java:64)"











share|improve this question

























  • I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

    – Eritrean
    Dec 5 '18 at 17:07
















0















So weird I get the results I need for the first 12 or so links then it crashes on the same link (AAA 090) every time. Im' not sure what the difference is between the links.



    import org.jsoup.*;
import org.jsoup.helper.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import org.w3c.dom.Document;

import java.io.*; // Only needed if scraping a local File.

public class Scraper {


public Scraper() {

org.jsoup.nodes.Document page = null;
org.jsoup.nodes.Document prefix = null;
org.jsoup.nodes.Document course = null;
org.jsoup.nodes.Document cls = null;
try {
page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();

// get the page title
String title = page.title();
System.out.println("title: " + title);

// get all links in page
Elements links = page.select("a[href]");
for (Element link : links) {
if(!link.text().equalsIgnoreCase("HELP")&&!link.text().equalsIgnoreCase("Exit"))
{
prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();
Elements links2 = prefix.select("a[href]");
for (Element link2 : links2) {
if(link2.text().matches("[A-Z]{3}"))
{
course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();
Elements links3 = course.select("a[href]");
for (Element link3 : links3) {
if(link3.text().matches("[A-Z]{3} [0-9]{3}"))
{
cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();
Elements links4 = cls.getAllElements();


//Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");
System.out.println("nhref = " + link3.attr("href") + "n" + cls.text() + "n");
System.out.println("link: " + cls.tagName());
System.out.println("Course Number: " + link3.data().toString());

}
}
}
}
}
}
}catch (IOException ioe) {
ioe.printStackTrace();
}
}

public static void main (String args) {

new Scraper();
}

}



org.jsoup.HttpStatusException: HTTP error fetching URL. Status=400, >URL=https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?>pi_search_type=SB_COURSE&_subj_code=AAA&pi_crse_numb=090&pi_archive_date=&pi_co>urse_status=A&pi_term_code=201920 at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:776 )at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722) at >org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306) at >org.jsoup.helper.HttpConnection.get(HttpConnection.java:295) at Scraper.(Scraper.java:42) at Scraper.main(Scraper.java:64)"











share|improve this question

























  • I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

    – Eritrean
    Dec 5 '18 at 17:07














0












0








0








So weird I get the results I need for the first 12 or so links then it crashes on the same link (AAA 090) every time. Im' not sure what the difference is between the links.



    import org.jsoup.*;
import org.jsoup.helper.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import org.w3c.dom.Document;

import java.io.*; // Only needed if scraping a local File.

public class Scraper {


public Scraper() {

org.jsoup.nodes.Document page = null;
org.jsoup.nodes.Document prefix = null;
org.jsoup.nodes.Document course = null;
org.jsoup.nodes.Document cls = null;
try {
page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();

// get the page title
String title = page.title();
System.out.println("title: " + title);

// get all links in page
Elements links = page.select("a[href]");
for (Element link : links) {
if(!link.text().equalsIgnoreCase("HELP")&&!link.text().equalsIgnoreCase("Exit"))
{
prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();
Elements links2 = prefix.select("a[href]");
for (Element link2 : links2) {
if(link2.text().matches("[A-Z]{3}"))
{
course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();
Elements links3 = course.select("a[href]");
for (Element link3 : links3) {
if(link3.text().matches("[A-Z]{3} [0-9]{3}"))
{
cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();
Elements links4 = cls.getAllElements();


//Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");
System.out.println("nhref = " + link3.attr("href") + "n" + cls.text() + "n");
System.out.println("link: " + cls.tagName());
System.out.println("Course Number: " + link3.data().toString());

}
}
}
}
}
}
}catch (IOException ioe) {
ioe.printStackTrace();
}
}

public static void main (String args) {

new Scraper();
}

}



org.jsoup.HttpStatusException: HTTP error fetching URL. Status=400, >URL=https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?>pi_search_type=SB_COURSE&_subj_code=AAA&pi_crse_numb=090&pi_archive_date=&pi_co>urse_status=A&pi_term_code=201920 at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:776 )at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722) at >org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306) at >org.jsoup.helper.HttpConnection.get(HttpConnection.java:295) at Scraper.(Scraper.java:42) at Scraper.main(Scraper.java:64)"











share|improve this question
















So weird I get the results I need for the first 12 or so links then it crashes on the same link (AAA 090) every time. Im' not sure what the difference is between the links.



    import org.jsoup.*;
import org.jsoup.helper.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import org.w3c.dom.Document;

import java.io.*; // Only needed if scraping a local File.

public class Scraper {


public Scraper() {

org.jsoup.nodes.Document page = null;
org.jsoup.nodes.Document prefix = null;
org.jsoup.nodes.Document course = null;
org.jsoup.nodes.Document cls = null;
try {
page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();

// get the page title
String title = page.title();
System.out.println("title: " + title);

// get all links in page
Elements links = page.select("a[href]");
for (Element link : links) {
if(!link.text().equalsIgnoreCase("HELP")&&!link.text().equalsIgnoreCase("Exit"))
{
prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();
Elements links2 = prefix.select("a[href]");
for (Element link2 : links2) {
if(link2.text().matches("[A-Z]{3}"))
{
course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();
Elements links3 = course.select("a[href]");
for (Element link3 : links3) {
if(link3.text().matches("[A-Z]{3} [0-9]{3}"))
{
cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();
Elements links4 = cls.getAllElements();


//Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");
System.out.println("nhref = " + link3.attr("href") + "n" + cls.text() + "n");
System.out.println("link: " + cls.tagName());
System.out.println("Course Number: " + link3.data().toString());

}
}
}
}
}
}
}catch (IOException ioe) {
ioe.printStackTrace();
}
}

public static void main (String args) {

new Scraper();
}

}



org.jsoup.HttpStatusException: HTTP error fetching URL. Status=400, >URL=https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?>pi_search_type=SB_COURSE&_subj_code=AAA&pi_crse_numb=090&pi_archive_date=&pi_co>urse_status=A&pi_term_code=201920 at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:776 )at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722) at >org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306) at >org.jsoup.helper.HttpConnection.get(HttpConnection.java:295) at Scraper.(Scraper.java:42) at Scraper.main(Scraper.java:64)"








jsoup






share|improve this question















share|improve this question













share|improve this question




share|improve this question








edited Nov 27 '18 at 1:07







bob wonch

















asked Nov 26 '18 at 23:52









bob wonchbob wonch

35




35













  • I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

    – Eritrean
    Dec 5 '18 at 17:07



















  • I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

    – Eritrean
    Dec 5 '18 at 17:07

















I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

– Eritrean
Dec 5 '18 at 17:07





I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

– Eritrean
Dec 5 '18 at 17:07












1 Answer
1






active

oldest

votes


















0














I think you are following the path "Why make it simple if it can be complicated?". I just have a look at that website you are trying to scrap. The site is well structured and organized. So no need for concatenating strings, using regex and do all the scraping in the constructor. I would sugest following steps:




  1. Get the first page and select the table with the prefix links from A to Z

  2. For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

  3. For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999


Your code could be something like :



import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {
final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"
+ "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";

public static void main(String args) {
try {
Document page = Jsoup.connect(homePage).get();
Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element prefix : prefixLinks){
Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();
Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element coursePrefix : coursePrefixLinks){
Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();
Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element course : courseLinks){
Document coursePage = Jsoup.connect(course.absUrl("href")).get();
try{
String courseNr = coursePage.select("tr:contains(Course:)").first().text();
String courseTitle = coursePage.select("tr:contains(Title:)").first().text();
System.out.println(courseNr +"n" + courseTitle + "n********************");
}
catch(NullPointerException np){
System.out.println("Broken link, Page not Found");
}
}
}
}

} catch (IOException ex) {
ex.printStackTrace();
}
}
}





share|improve this answer


























  • Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

    – Eritrean
    Dec 4 '18 at 14:07











  • Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

    – bob wonch
    Dec 5 '18 at 16:14













  • I ended up fixing it via spliting the string and stplitting for a few specific cases.

    – bob wonch
    Dec 5 '18 at 16:15











  • I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

    – Eritrean
    Dec 5 '18 at 16:38











Your Answer






StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});


}
});














draft saved

draft discarded


















StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53490821%2fjsoup-return-status-400-error-on-some-links-but-not-others-on-the-same-page%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown

























1 Answer
1






active

oldest

votes








1 Answer
1






active

oldest

votes









active

oldest

votes






active

oldest

votes









0














I think you are following the path "Why make it simple if it can be complicated?". I just have a look at that website you are trying to scrap. The site is well structured and organized. So no need for concatenating strings, using regex and do all the scraping in the constructor. I would sugest following steps:




  1. Get the first page and select the table with the prefix links from A to Z

  2. For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

  3. For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999


Your code could be something like :



import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {
final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"
+ "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";

public static void main(String args) {
try {
Document page = Jsoup.connect(homePage).get();
Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element prefix : prefixLinks){
Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();
Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element coursePrefix : coursePrefixLinks){
Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();
Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element course : courseLinks){
Document coursePage = Jsoup.connect(course.absUrl("href")).get();
try{
String courseNr = coursePage.select("tr:contains(Course:)").first().text();
String courseTitle = coursePage.select("tr:contains(Title:)").first().text();
System.out.println(courseNr +"n" + courseTitle + "n********************");
}
catch(NullPointerException np){
System.out.println("Broken link, Page not Found");
}
}
}
}

} catch (IOException ex) {
ex.printStackTrace();
}
}
}





share|improve this answer


























  • Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

    – Eritrean
    Dec 4 '18 at 14:07











  • Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

    – bob wonch
    Dec 5 '18 at 16:14













  • I ended up fixing it via spliting the string and stplitting for a few specific cases.

    – bob wonch
    Dec 5 '18 at 16:15











  • I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

    – Eritrean
    Dec 5 '18 at 16:38
















0














I think you are following the path "Why make it simple if it can be complicated?". I just have a look at that website you are trying to scrap. The site is well structured and organized. So no need for concatenating strings, using regex and do all the scraping in the constructor. I would sugest following steps:




  1. Get the first page and select the table with the prefix links from A to Z

  2. For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

  3. For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999


Your code could be something like :



import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {
final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"
+ "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";

public static void main(String args) {
try {
Document page = Jsoup.connect(homePage).get();
Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element prefix : prefixLinks){
Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();
Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element coursePrefix : coursePrefixLinks){
Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();
Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element course : courseLinks){
Document coursePage = Jsoup.connect(course.absUrl("href")).get();
try{
String courseNr = coursePage.select("tr:contains(Course:)").first().text();
String courseTitle = coursePage.select("tr:contains(Title:)").first().text();
System.out.println(courseNr +"n" + courseTitle + "n********************");
}
catch(NullPointerException np){
System.out.println("Broken link, Page not Found");
}
}
}
}

} catch (IOException ex) {
ex.printStackTrace();
}
}
}





share|improve this answer


























  • Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

    – Eritrean
    Dec 4 '18 at 14:07











  • Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

    – bob wonch
    Dec 5 '18 at 16:14













  • I ended up fixing it via spliting the string and stplitting for a few specific cases.

    – bob wonch
    Dec 5 '18 at 16:15











  • I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

    – Eritrean
    Dec 5 '18 at 16:38














0












0








0







I think you are following the path "Why make it simple if it can be complicated?". I just have a look at that website you are trying to scrap. The site is well structured and organized. So no need for concatenating strings, using regex and do all the scraping in the constructor. I would sugest following steps:




  1. Get the first page and select the table with the prefix links from A to Z

  2. For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

  3. For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999


Your code could be something like :



import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {
final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"
+ "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";

public static void main(String args) {
try {
Document page = Jsoup.connect(homePage).get();
Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element prefix : prefixLinks){
Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();
Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element coursePrefix : coursePrefixLinks){
Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();
Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element course : courseLinks){
Document coursePage = Jsoup.connect(course.absUrl("href")).get();
try{
String courseNr = coursePage.select("tr:contains(Course:)").first().text();
String courseTitle = coursePage.select("tr:contains(Title:)").first().text();
System.out.println(courseNr +"n" + courseTitle + "n********************");
}
catch(NullPointerException np){
System.out.println("Broken link, Page not Found");
}
}
}
}

} catch (IOException ex) {
ex.printStackTrace();
}
}
}





share|improve this answer















I think you are following the path "Why make it simple if it can be complicated?". I just have a look at that website you are trying to scrap. The site is well structured and organized. So no need for concatenating strings, using regex and do all the scraping in the constructor. I would sugest following steps:




  1. Get the first page and select the table with the prefix links from A to Z

  2. For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

  3. For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999


Your code could be something like :



import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupTest {
final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"
+ "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";

public static void main(String args) {
try {
Document page = Jsoup.connect(homePage).get();
Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element prefix : prefixLinks){
Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();
Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element coursePrefix : coursePrefixLinks){
Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();
Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");

for(Element course : courseLinks){
Document coursePage = Jsoup.connect(course.absUrl("href")).get();
try{
String courseNr = coursePage.select("tr:contains(Course:)").first().text();
String courseTitle = coursePage.select("tr:contains(Title:)").first().text();
System.out.println(courseNr +"n" + courseTitle + "n********************");
}
catch(NullPointerException np){
System.out.println("Broken link, Page not Found");
}
}
}
}

} catch (IOException ex) {
ex.printStackTrace();
}
}
}






share|improve this answer














share|improve this answer



share|improve this answer








edited Dec 4 '18 at 13:44

























answered Dec 4 '18 at 11:12









EritreanEritrean

3,5021914




3,5021914













  • Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

    – Eritrean
    Dec 4 '18 at 14:07











  • Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

    – bob wonch
    Dec 5 '18 at 16:14













  • I ended up fixing it via spliting the string and stplitting for a few specific cases.

    – bob wonch
    Dec 5 '18 at 16:15











  • I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

    – Eritrean
    Dec 5 '18 at 16:38



















  • Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

    – Eritrean
    Dec 4 '18 at 14:07











  • Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

    – bob wonch
    Dec 5 '18 at 16:14













  • I ended up fixing it via spliting the string and stplitting for a few specific cases.

    – bob wonch
    Dec 5 '18 at 16:15











  • I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

    – Eritrean
    Dec 5 '18 at 16:38

















Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

– Eritrean
Dec 4 '18 at 14:07





Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

– Eritrean
Dec 4 '18 at 14:07













Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

– bob wonch
Dec 5 '18 at 16:14







Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

– bob wonch
Dec 5 '18 at 16:14















I ended up fixing it via spliting the string and stplitting for a few specific cases.

– bob wonch
Dec 5 '18 at 16:15





I ended up fixing it via spliting the string and stplitting for a few specific cases.

– bob wonch
Dec 5 '18 at 16:15













I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

– Eritrean
Dec 5 '18 at 16:38





I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

– Eritrean
Dec 5 '18 at 16:38




















draft saved

draft discarded




















































Thanks for contributing an answer to Stack Overflow!


  • Please be sure to answer the question. Provide details and share your research!

But avoid



  • Asking for help, clarification, or responding to other answers.

  • Making statements based on opinion; back them up with references or personal experience.


To learn more, see our tips on writing great answers.




draft saved


draft discarded














StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53490821%2fjsoup-return-status-400-error-on-some-links-but-not-others-on-the-same-page%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown





















































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown

































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown







Popular posts from this blog

A CLEAN and SIMPLE way to add appendices to Table of Contents and bookmarks

Calculate evaluation metrics using cross_val_predict sklearn

Insert data from modal to MySQL (multiple modal on website)