Jsoup return Status 400 error on some links but not others on the same page

So weird I get the results I need for the first 12 or so links then it crashes on the same link (AAA 090) every time. Im' not sure what the difference is between the links.

    import org.jsoup.*;

    import org.jsoup.helper.*;

    import org.jsoup.nodes.*;

    import org.jsoup.select.*;

    import org.w3c.dom.Document;



    import java.io.*; // Only needed if scraping a local File.



    public class Scraper {





        public Scraper() {



            org.jsoup.nodes.Document page = null;

            org.jsoup.nodes.Document prefix = null;

            org.jsoup.nodes.Document course = null;

            org.jsoup.nodes.Document cls = null;

            try {

                page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();



            // get the page title

            String title = page.title();

            System.out.println("title: " + title);



            // get all links in page

            Elements links = page.select("a[href]");

            for (Element link : links) {

                if(!link.text().equalsIgnoreCase("HELP")&&!link.text().equalsIgnoreCase("Exit"))

                { 

                    prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();

                    Elements links2 = prefix.select("a[href]");

                    for (Element link2 : links2) {

                        if(link2.text().matches("[A-Z]{3}"))

                        {

                            course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();

                            Elements links3 = course.select("a[href]");

                            for (Element link3 : links3) {

                                if(link3.text().matches("[A-Z]{3} [0-9]{3}"))

                                {

                                    cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();

                                    Elements links4 = cls.getAllElements();





                                    //Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");

                                    System.out.println("nhref = " + link3.attr("href") + "n" +  cls.text() + "n");

                                    System.out.println("link: " + cls.tagName());

                                    System.out.println("Course Number: " + link3.data().toString());



                                }

                            }

                        }

                    }

                }

            }

        }catch (IOException ioe) {

            ioe.printStackTrace();

        }

    }



    public static void main (String args) {



        new Scraper();

    }



}

org.jsoup.HttpStatusException: HTTP error fetching URL. Status=400, >URL=https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?>pi_search_type=SB_COURSE&_subj_code=AAA&pi_crse_numb=090&pi_archive_date=&pi_co>urse_status=A&pi_term_code=201920 at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:776 )at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722) at >org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306) at >org.jsoup.helper.HttpConnection.get(HttpConnection.java:295) at Scraper.(Scraper.java:42) at Scraper.main(Scraper.java:64)"

edited Nov 27 '18 at 1:07

asked Nov 26 '18 at 23:52

bob wonch

I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

– Eritrean
Dec 5 '18 at 17:07

add a comment |

So weird I get the results I need for the first 12 or so links then it crashes on the same link (AAA 090) every time. Im' not sure what the difference is between the links.

    import org.jsoup.*;

    import org.jsoup.helper.*;

    import org.jsoup.nodes.*;

    import org.jsoup.select.*;

    import org.w3c.dom.Document;



    import java.io.*; // Only needed if scraping a local File.



    public class Scraper {





        public Scraper() {



            org.jsoup.nodes.Document page = null;

            org.jsoup.nodes.Document prefix = null;

            org.jsoup.nodes.Document course = null;

            org.jsoup.nodes.Document cls = null;

            try {

                page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();



            // get the page title

            String title = page.title();

            System.out.println("title: " + title);



            // get all links in page

            Elements links = page.select("a[href]");

            for (Element link : links) {

                if(!link.text().equalsIgnoreCase("HELP")&&!link.text().equalsIgnoreCase("Exit"))

                { 

                    prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();

                    Elements links2 = prefix.select("a[href]");

                    for (Element link2 : links2) {

                        if(link2.text().matches("[A-Z]{3}"))

                        {

                            course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();

                            Elements links3 = course.select("a[href]");

                            for (Element link3 : links3) {

                                if(link3.text().matches("[A-Z]{3} [0-9]{3}"))

                                {

                                    cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();

                                    Elements links4 = cls.getAllElements();





                                    //Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");

                                    System.out.println("nhref = " + link3.attr("href") + "n" +  cls.text() + "n");

                                    System.out.println("link: " + cls.tagName());

                                    System.out.println("Course Number: " + link3.data().toString());



                                }

                            }

                        }

                    }

                }

            }

        }catch (IOException ioe) {

            ioe.printStackTrace();

        }

    }



    public static void main (String args) {



        new Scraper();

    }



}

org.jsoup.HttpStatusException: HTTP error fetching URL. Status=400, >URL=https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?>pi_search_type=SB_COURSE&_subj_code=AAA&pi_crse_numb=090&pi_archive_date=&pi_co>urse_status=A&pi_term_code=201920 at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:776 )at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722) at >org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306) at >org.jsoup.helper.HttpConnection.get(HttpConnection.java:295) at Scraper.(Scraper.java:42) at Scraper.main(Scraper.java:64)"

edited Nov 27 '18 at 1:07

asked Nov 26 '18 at 23:52

bob wonch

I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

– Eritrean
Dec 5 '18 at 17:07

add a comment |

So weird I get the results I need for the first 12 or so links then it crashes on the same link (AAA 090) every time. Im' not sure what the difference is between the links.

    import org.jsoup.*;

    import org.jsoup.helper.*;

    import org.jsoup.nodes.*;

    import org.jsoup.select.*;

    import org.w3c.dom.Document;



    import java.io.*; // Only needed if scraping a local File.



    public class Scraper {





        public Scraper() {



            org.jsoup.nodes.Document page = null;

            org.jsoup.nodes.Document prefix = null;

            org.jsoup.nodes.Document course = null;

            org.jsoup.nodes.Document cls = null;

            try {

                page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();



            // get the page title

            String title = page.title();

            System.out.println("title: " + title);



            // get all links in page

            Elements links = page.select("a[href]");

            for (Element link : links) {

                if(!link.text().equalsIgnoreCase("HELP")&&!link.text().equalsIgnoreCase("Exit"))

                { 

                    prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();

                    Elements links2 = prefix.select("a[href]");

                    for (Element link2 : links2) {

                        if(link2.text().matches("[A-Z]{3}"))

                        {

                            course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();

                            Elements links3 = course.select("a[href]");

                            for (Element link3 : links3) {

                                if(link3.text().matches("[A-Z]{3} [0-9]{3}"))

                                {

                                    cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();

                                    Elements links4 = cls.getAllElements();





                                    //Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");

                                    System.out.println("nhref = " + link3.attr("href") + "n" +  cls.text() + "n");

                                    System.out.println("link: " + cls.tagName());

                                    System.out.println("Course Number: " + link3.data().toString());



                                }

                            }

                        }

                    }

                }

            }

        }catch (IOException ioe) {

            ioe.printStackTrace();

        }

    }



    public static void main (String args) {



        new Scraper();

    }



}

org.jsoup.HttpStatusException: HTTP error fetching URL. Status=400, >URL=https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?>pi_search_type=SB_COURSE&_subj_code=AAA&pi_crse_numb=090&pi_archive_date=&pi_co>urse_status=A&pi_term_code=201920 at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:776 )at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722) at >org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306) at >org.jsoup.helper.HttpConnection.get(HttpConnection.java:295) at Scraper.(Scraper.java:42) at Scraper.main(Scraper.java:64)"

edited Nov 27 '18 at 1:07

asked Nov 26 '18 at 23:52

bob wonch

So weird I get the results I need for the first 12 or so links then it crashes on the same link (AAA 090) every time. Im' not sure what the difference is between the links.

    import org.jsoup.*;

    import org.jsoup.helper.*;

    import org.jsoup.nodes.*;

    import org.jsoup.select.*;

    import org.w3c.dom.Document;



    import java.io.*; // Only needed if scraping a local File.



    public class Scraper {





        public Scraper() {



            org.jsoup.nodes.Document page = null;

            org.jsoup.nodes.Document prefix = null;

            org.jsoup.nodes.Document course = null;

            org.jsoup.nodes.Document cls = null;

            try {

                page = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX").get();



            // get the page title

            String title = page.title();

            System.out.println("title: " + title);



            // get all links in page

            Elements links = page.select("a[href]");

            for (Element link : links) {

                if(!link.text().equalsIgnoreCase("HELP")&&!link.text().equalsIgnoreCase("Exit"))

                { 

                    prefix = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link.attr("href")).get();

                    Elements links2 = prefix.select("a[href]");

                    for (Element link2 : links2) {

                        if(link2.text().matches("[A-Z]{3}"))

                        {

                            course = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link2.attr("href")).get();

                            Elements links3 = course.select("a[href]");

                            for (Element link3 : links3) {

                                if(link3.text().matches("[A-Z]{3} [0-9]{3}"))

                                {

                                    cls = Jsoup.connect("https://erpdnssb.cccs.edu/PRODCCCS/" + link3.attr("href")).get();

                                    Elements links4 = cls.getAllElements();





                                    //Elements courseNum = link4.select("body > div.pagebodydiv > table:nth-child(4) > tbody > tr:nth-child(1)");

                                    System.out.println("nhref = " + link3.attr("href") + "n" +  cls.text() + "n");

                                    System.out.println("link: " + cls.tagName());

                                    System.out.println("Course Number: " + link3.data().toString());



                                }

                            }

                        }

                    }

                }

            }

        }catch (IOException ioe) {

            ioe.printStackTrace();

        }

    }



    public static void main (String args) {



        new Scraper();

    }



}

org.jsoup.HttpStatusException: HTTP error fetching URL. Status=400, >URL=https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?>pi_search_type=SB_COURSE&_subj_code=AAA&pi_crse_numb=090&pi_archive_date=&pi_co>urse_status=A&pi_term_code=201920 at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:776 )at >org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:722) at >org.jsoup.helper.HttpConnection.execute(HttpConnection.java:306) at >org.jsoup.helper.HttpConnection.get(HttpConnection.java:295) at Scraper.(Scraper.java:42) at Scraper.main(Scraper.java:64)"

jsoup

edited Nov 27 '18 at 1:07

asked Nov 26 '18 at 23:52

bob wonch

edited Nov 27 '18 at 1:07

asked Nov 26 '18 at 23:52

bob wonch

edited Nov 27 '18 at 1:07

asked Nov 26 '18 at 23:52

bob wonch

asked Nov 26 '18 at 23:52

bob wonch

asked Nov 26 '18 at 23:52

bob wonch

I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

– Eritrean
Dec 5 '18 at 17:07

add a comment |

I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

– Eritrean
Dec 5 '18 at 17:07

I have copy pasted your original code and also get the desired output. Can you try again with a setted UserAgent, i.e. String userAgent ="Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko"; Jsoup.connect(someUrl).userAgent(userAgent).get(); ?

– Eritrean
Dec 5 '18 at 17:07

add a comment |

1 Answer
1

active

oldest

votes

I think you are following the path "Why make it simple if it can be complicated?". I just have a look at that website you are trying to scrap. The site is well structured and organized. So no need for concatenating strings, using regex and do all the scraping in the constructor. I would sugest following steps:

Get the first page and select the table with the prefix links from A to Z

For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999

Your code could be something like :

import java.io.IOException;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;



public class JsoupTest {

    final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"

                      + "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";



    public static void main(String args) {

        try {

            Document page = Jsoup.connect(homePage).get();

            Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



            for(Element prefix : prefixLinks){

                Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();

                Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



                for(Element coursePrefix : coursePrefixLinks){

                    Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();

                    Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");                    



                    for(Element course : courseLinks){

                        Document coursePage = Jsoup.connect(course.absUrl("href")).get();

                        try{

                            String courseNr = coursePage.select("tr:contains(Course:)").first().text();  

                            String courseTitle = coursePage.select("tr:contains(Title:)").first().text(); 

                            System.out.println(courseNr +"n" + courseTitle + "n********************");

                        }

                        catch(NullPointerException np){

                            System.out.println("Broken link, Page not Found");

                        }

                    }

                }

            }



        } catch (IOException ex) {

            ex.printStackTrace();

        }

    }

}

edited Dec 4 '18 at 13:44

answered Dec 4 '18 at 11:12

Eritrean

3,5021914

Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

– Eritrean
Dec 4 '18 at 14:07

Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

– bob wonch
Dec 5 '18 at 16:14

I ended up fixing it via spliting the string and stplitting for a few specific cases.

– bob wonch
Dec 5 '18 at 16:15

I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

– Eritrean
Dec 5 '18 at 16:38

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53490821%2fjsoup-return-status-400-error-on-some-links-but-not-others-on-the-same-page%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

Get the first page and select the table with the prefix links from A to Z

For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999

Your code could be something like :

import java.io.IOException;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;



public class JsoupTest {

    final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"

                      + "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";



    public static void main(String args) {

        try {

            Document page = Jsoup.connect(homePage).get();

            Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



            for(Element prefix : prefixLinks){

                Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();

                Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



                for(Element coursePrefix : coursePrefixLinks){

                    Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();

                    Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");                    



                    for(Element course : courseLinks){

                        Document coursePage = Jsoup.connect(course.absUrl("href")).get();

                        try{

                            String courseNr = coursePage.select("tr:contains(Course:)").first().text();  

                            String courseTitle = coursePage.select("tr:contains(Title:)").first().text(); 

                            System.out.println(courseNr +"n" + courseTitle + "n********************");

                        }

                        catch(NullPointerException np){

                            System.out.println("Broken link, Page not Found");

                        }

                    }

                }

            }



        } catch (IOException ex) {

            ex.printStackTrace();

        }

    }

}

edited Dec 4 '18 at 13:44

answered Dec 4 '18 at 11:12

Eritrean

3,5021914

Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

– Eritrean
Dec 4 '18 at 14:07

Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

– bob wonch
Dec 5 '18 at 16:14

I ended up fixing it via spliting the string and stplitting for a few specific cases.

– bob wonch
Dec 5 '18 at 16:15

I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

– Eritrean
Dec 5 '18 at 16:38

add a comment |

Get the first page and select the table with the prefix links from A to Z

For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999

Your code could be something like :

import java.io.IOException;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;



public class JsoupTest {

    final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"

                      + "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";



    public static void main(String args) {

        try {

            Document page = Jsoup.connect(homePage).get();

            Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



            for(Element prefix : prefixLinks){

                Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();

                Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



                for(Element coursePrefix : coursePrefixLinks){

                    Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();

                    Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");                    



                    for(Element course : courseLinks){

                        Document coursePage = Jsoup.connect(course.absUrl("href")).get();

                        try{

                            String courseNr = coursePage.select("tr:contains(Course:)").first().text();  

                            String courseTitle = coursePage.select("tr:contains(Title:)").first().text(); 

                            System.out.println(courseNr +"n" + courseTitle + "n********************");

                        }

                        catch(NullPointerException np){

                            System.out.println("Broken link, Page not Found");

                        }

                    }

                }

            }



        } catch (IOException ex) {

            ex.printStackTrace();

        }

    }

}

edited Dec 4 '18 at 13:44

answered Dec 4 '18 at 11:12

Eritrean

3,5021914

Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

– Eritrean
Dec 4 '18 at 14:07

Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

– bob wonch
Dec 5 '18 at 16:14

I ended up fixing it via spliting the string and stplitting for a few specific cases.

– bob wonch
Dec 5 '18 at 16:15

I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

– Eritrean
Dec 5 '18 at 16:38

add a comment |

Get the first page and select the table with the prefix links from A to Z

For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999

Your code could be something like :

import java.io.IOException;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;



public class JsoupTest {

    final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"

                      + "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";



    public static void main(String args) {

        try {

            Document page = Jsoup.connect(homePage).get();

            Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



            for(Element prefix : prefixLinks){

                Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();

                Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



                for(Element coursePrefix : coursePrefixLinks){

                    Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();

                    Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");                    



                    for(Element course : courseLinks){

                        Document coursePage = Jsoup.connect(course.absUrl("href")).get();

                        try{

                            String courseNr = coursePage.select("tr:contains(Course:)").first().text();  

                            String courseTitle = coursePage.select("tr:contains(Title:)").first().text(); 

                            System.out.println(courseNr +"n" + courseTitle + "n********************");

                        }

                        catch(NullPointerException np){

                            System.out.println("Broken link, Page not Found");

                        }

                    }

                }

            }



        } catch (IOException ex) {

            ex.printStackTrace();

        }

    }

}

edited Dec 4 '18 at 13:44

answered Dec 4 '18 at 11:12

Eritrean

3,5021914

Get the first page and select the table with the prefix links from A to Z

For each link from 1. get the page of the prefix and select the table with the course prefix links. For example for the prefix A to get the links from AAA to AVT

For each course prefix link from 2. get the page of the course prefix and select the table containing the course links. For example for AAA to get the links from AAA 010 to AAA 999

Your code could be something like :

import java.io.IOException;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;



public class JsoupTest {

    final static String homePage = "https://erpdnssb.cccs.edu/PRODCCCS/ccns_pub_controller.p_command_processor?"

                      + "pi_course_status=A&pi_term_code=201920&pi_search_type=SB_PREFIX";



    public static void main(String args) {

        try {

            Document page = Jsoup.connect(homePage).get();

            Elements prefixLinks = page.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



            for(Element prefix : prefixLinks){

                Document prefixPage = Jsoup.connect(prefix.absUrl("href")).get();

                Elements coursePrefixLinks = prefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");



                for(Element coursePrefix : coursePrefixLinks){

                    Document coursePrefixPage = Jsoup.connect(coursePrefix.absUrl("href")).get();

                    Elements courseLinks = coursePrefixPage.select(".pagebodydiv > table:nth-child(6) tbody tr td a");                    



                    for(Element course : courseLinks){

                        Document coursePage = Jsoup.connect(course.absUrl("href")).get();

                        try{

                            String courseNr = coursePage.select("tr:contains(Course:)").first().text();  

                            String courseTitle = coursePage.select("tr:contains(Title:)").first().text(); 

                            System.out.println(courseNr +"n" + courseTitle + "n********************");

                        }

                        catch(NullPointerException np){

                            System.out.println("Broken link, Page not Found");

                        }

                    }

                }

            }



        } catch (IOException ex) {

            ex.printStackTrace();

        }

    }

}

edited Dec 4 '18 at 13:44

answered Dec 4 '18 at 11:12

Eritrean

3,5021914

edited Dec 4 '18 at 13:44

answered Dec 4 '18 at 11:12

Eritrean

3,5021914

answered Dec 4 '18 at 11:12

Eritrean

3,5021914

answered Dec 4 '18 at 11:12

Eritrean

3,5021914

Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

– Eritrean
Dec 4 '18 at 14:07

Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

– bob wonch
Dec 5 '18 at 16:14

I ended up fixing it via spliting the string and stplitting for a few specific cases.

– bob wonch
Dec 5 '18 at 16:15

I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

– Eritrean
Dec 5 '18 at 16:38

add a comment |

Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

– Eritrean
Dec 4 '18 at 14:07

Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

– bob wonch
Dec 5 '18 at 16:14

I ended up fixing it via spliting the string and stplitting for a few specific cases.

– bob wonch
Dec 5 '18 at 16:15

I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

– Eritrean
Dec 5 '18 at 16:38

Note this will take a while (>30 minutes) as you are trying to get data from 8925 pages.

– Eritrean
Dec 4 '18 at 14:07

Thanks for the input~! With this code, I get the same error on AAA 090...I tried something similar before and that's how I ended up with breaking up the URL and replacing the bits that are missing on the returned link.

– bob wonch
Dec 5 '18 at 16:14

I ended up fixing it via spliting the string and stplitting for a few specific cases.

– bob wonch
Dec 5 '18 at 16:15

I cant reproduce your issue. With the code above i get the desired output. The first broken link is for the course BIO 223 which get catched in the most innerest for loop.

– Eritrean
Dec 5 '18 at 16:38

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Btukfyl