From 5acf2437013ee9afea653bea9bbcbf3fe6865626 Mon Sep 17 00:00:00 2001 From: Date: Thu, 6 Jun 2019 21:19:05 +0900 Subject: [PATCH] Ranking of the URL's by tf idf added --- App.java | 184 ---------------------- Article.java | 63 -------- CAu_NLP_2019/.idea/workspace.xml | 151 ++++++++++++++---- CAu_NLP_2019/src/main/java/App.java | 153 +++++++++--------- CAu_NLP_2019/src/main/java/Article.java | 12 ++ CAu_NLP_2019/src/main/java/TFIDF.java | 46 ++++++ CAu_NLP_2019/target/classes/App.class | Bin 4418 -> 7163 bytes CAu_NLP_2019/target/classes/Article.class | Bin 1732 -> 1977 bytes CAu_NLP_2019/target/classes/TFIDF.class | Bin 0 -> 1815 bytes 9 files changed, 264 insertions(+), 345 deletions(-) delete mode 100644 App.java delete mode 100644 Article.java create mode 100644 CAu_NLP_2019/src/main/java/TFIDF.java create mode 100644 CAu_NLP_2019/target/classes/TFIDF.class diff --git a/App.java b/App.java deleted file mode 100644 index e69773a..0000000 --- a/App.java +++ /dev/null @@ -1,184 +0,0 @@ - - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Scanner; -import java.util.StringTokenizer; - -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -public class App { - public static void main(String[] args) throws Exception { - String url[] = {"https://www.thesun.co.uk/?s=", "https://www.bbc.co.uk/search?q=", "https://www.skysports.com/search?q="}; - //ArrayList key = new ArrayList(); - Article data = new Article(); - //key.add("salah"); - //key.add("Liverpool"); - //key.add("Champions league"); - // 가져오고 싶은 정보가 있는 웹페이지의 url - Document doc = null; - Elements element = null; - - Scanner scanner = new Scanner(System.in); - System.out.print("Please type keywords : "); - String key[] = scanner.nextLine().split(","); - scanner.close(); - for (int j = 0; j < url.length; j++) { - for (int i = 0; i < key.length; i++) { - String urlTmp = url[j] + key[i]; - doc = Jsoup.connect(urlTmp).execute().parse(); // Document에 url 페이지의 데이터를 가져온다. - /* } catch (IOException e) { - e.printStackTrace(); - }*/ - if (url[j].equals("https://www.thesun.co.uk/?s=")) { - element = doc.select("div.search-results-wrap"); - for (Element el : element.select(".teaser-item")) { - if (el.select("p").text().toLowerCase().contains(key[i].toLowerCase())) { - data.setHeadline(el.select("p").text()); - Elements elUrl = el.select(".teaser__copy-container a"); - data.setUrl(elUrl.first().absUrl("href")); - String temp = el.select(".search-date").text(); - data.setDate(changeDate(temp)); - data.setSite("The Sun"); - - - } - } - } - if (url[j].equals("https://www.bbc.co.uk/search?q=")) { - element = doc.select("section.search-content"); - for (Element el : element.select("li[data-result-number]")) { - data.setHeadline(el.select("h1").select("a").text()); - Elements elUrl = el.select("a[href]"); - data.setUrl(elUrl.first().absUrl("href")); - String temp = el.select(".display-date").text(); - data.setDate(changeDate(temp)); - data.setSite("BBC"); - } - } - if (url[j].equals("https://www.skysports.com/search?q=")) { - element = doc.select("div.news-list"); - for (Element el : element.select("div.news-list__item")) { - data.setHeadline(el.select("h4").select("a").text()); - Elements elUrl = el.select("a[href]"); - data.setUrl(elUrl.first().absUrl("href")); - String temp = el.select(".label__timestamp").text(); - data.setDate(changeDate2(temp)); - data.setSite("SKYSPORTS"); - } - } - } - } - - String content = ""; - for(int i = 0; i < data.getHowManyData(); i++) { - content = ""; - doc = Jsoup.connect(data.getUrl(i)).execute().parse(); - if (data.getSite(i) == "The Sun") { - element = doc.select("div.article__content"); - for (Element el : element.select("p")) { - content += el.text(); - } - data.setContent(content); - } - else if (data.getSite(i) == "BBC") { - element = doc.select("div#story-body"); - for (Element el : element.select("p")) { - content += el.text(); - } - data.setContent(content); - } - else if (data.getSite(i) == "SKYSPORTS") { - element = doc.select("div.article__body"); - for (Element el : element.select("p")) { - if (!el.hasClass("widge-marketing__text")) { - content += el.text(); - } - } - data.setContent(content); - } - System.out.println(data.getDate(i)); - System.out.println(data.getHeadline(i)); - System.out.println(data.getUrl(i)); - System.out.println(data.getSite(i)); - System.out.println(data.getContent(i)); - } - - } - - public static int changeDate2(String date) { - date = date.substring(0, 2) + date.substring(2 + 1); - date = date.substring(0, 4) + date.substring(4 + 1); - String year = date.substring(4, 8); - String month = date.substring(2,4); - String day = date.substring(0, 2); - String fdate = year + month + day; - - int mydate = Integer.parseInt(fdate); - return mydate; - } - - public static int changeDate(String date) { - int formdate = 0; - String sp[] = date.split(" "); - formdate += Integer.parseInt(sp[2]) * 10000; - formdate += Integer.parseInt(sp[0]); - - switch (sp[1]) { - case "January": - case "Jan": - formdate += 100; - break; - case "February": - case "Feb": - formdate += 200; - break; - case "March": - case "Mar": - formdate += 300; - break; - case "April": - case "Apr": - formdate += 400; - break; - case "May": - formdate += 500; - break; - case "June": - case "Jun": - formdate += 600; - break; - case "July": - case "Jul": - formdate += 700; - break; - case "August": - case "Aug": - formdate += 800; - break; - case "September": - case "Sep": - formdate += 900; - break; - case "October": - case "Oct": - formdate += 1000; - break; - case "November": - case "Nov": - formdate += 1100; - break; - case "December": - case "Dec": - formdate += 1200; - break; - - } - - return formdate; - - } -} \ No newline at end of file diff --git a/Article.java b/Article.java deleted file mode 100644 index 1db8a3c..0000000 --- a/Article.java +++ /dev/null @@ -1,63 +0,0 @@ -import java.util.ArrayList; - -public class Article { - - public Article() { - date = new ArrayList(); - Headline = new ArrayList(); - Url = new ArrayList(); - Site = new ArrayList(); - Content = new ArrayList(); - } - - private ArrayList date; - private ArrayList Headline; - private ArrayList Url; - private ArrayList Site; - private ArrayList Content; - - public void setDate(int num) { - date.add(num); - } - - public void setHeadline(String head) { - Headline.add(head); - } - - public void setUrl(String url) { - Url.add(url); - } - - public void setSite(String site) { - Site.add(site); - } - - public void setContent(String content) { - Content.add(content); - } - - - public int getHowManyData() { - return Headline.size(); - } - - public int getDate(int num) { - return date.get(num); - } - - public String getHeadline(int num) { - return Headline.get(num); - } - - public String getUrl(int num) { - return Url.get(num); - } - - public String getSite(int num) { - return Site.get(num); - } - - public String getContent(int num) { - return Content.get(num); - } -} \ No newline at end of file diff --git a/CAu_NLP_2019/.idea/workspace.xml b/CAu_NLP_2019/.idea/workspace.xml index 77c97e5..0017d85 100644 --- a/CAu_NLP_2019/.idea/workspace.xml +++ b/CAu_NLP_2019/.idea/workspace.xml @@ -11,8 +11,7 @@