Added ranking of the URLs by TF-IDF
This commit is contained in:
parent
8494af6d6d
commit
5acf243701
184
App.java
184
App.java
@ -1,184 +0,0 @@
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Scanner;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
public class App {
|
||||
public static void main(String[] args) throws Exception {
|
||||
String url[] = {"https://www.thesun.co.uk/?s=", "https://www.bbc.co.uk/search?q=", "https://www.skysports.com/search?q="};
|
||||
//ArrayList<String> key = new ArrayList<String>();
|
||||
Article data = new Article();
|
||||
//key.add("salah");
|
||||
//key.add("Liverpool");
|
||||
//key.add("Champions league");
|
||||
// 가져오고 싶은 정보가 있는 웹페이지의 url
|
||||
Document doc = null;
|
||||
Elements element = null;
|
||||
|
||||
Scanner scanner = new Scanner(System.in);
|
||||
System.out.print("Please type keywords : ");
|
||||
String key[] = scanner.nextLine().split(",");
|
||||
scanner.close();
|
||||
for (int j = 0; j < url.length; j++) {
|
||||
for (int i = 0; i < key.length; i++) {
|
||||
String urlTmp = url[j] + key[i];
|
||||
doc = Jsoup.connect(urlTmp).execute().parse(); // Document에 url 페이지의 데이터를 가져온다.
|
||||
/* } catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}*/
|
||||
if (url[j].equals("https://www.thesun.co.uk/?s=")) {
|
||||
element = doc.select("div.search-results-wrap");
|
||||
for (Element el : element.select(".teaser-item")) {
|
||||
if (el.select("p").text().toLowerCase().contains(key[i].toLowerCase())) {
|
||||
data.setHeadline(el.select("p").text());
|
||||
Elements elUrl = el.select(".teaser__copy-container a");
|
||||
data.setUrl(elUrl.first().absUrl("href"));
|
||||
String temp = el.select(".search-date").text();
|
||||
data.setDate(changeDate(temp));
|
||||
data.setSite("The Sun");
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
if (url[j].equals("https://www.bbc.co.uk/search?q=")) {
|
||||
element = doc.select("section.search-content");
|
||||
for (Element el : element.select("li[data-result-number]")) {
|
||||
data.setHeadline(el.select("h1").select("a").text());
|
||||
Elements elUrl = el.select("a[href]");
|
||||
data.setUrl(elUrl.first().absUrl("href"));
|
||||
String temp = el.select(".display-date").text();
|
||||
data.setDate(changeDate(temp));
|
||||
data.setSite("BBC");
|
||||
}
|
||||
}
|
||||
if (url[j].equals("https://www.skysports.com/search?q=")) {
|
||||
element = doc.select("div.news-list");
|
||||
for (Element el : element.select("div.news-list__item")) {
|
||||
data.setHeadline(el.select("h4").select("a").text());
|
||||
Elements elUrl = el.select("a[href]");
|
||||
data.setUrl(elUrl.first().absUrl("href"));
|
||||
String temp = el.select(".label__timestamp").text();
|
||||
data.setDate(changeDate2(temp));
|
||||
data.setSite("SKYSPORTS");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
String content = "";
|
||||
for(int i = 0; i < data.getHowManyData(); i++) {
|
||||
content = "";
|
||||
doc = Jsoup.connect(data.getUrl(i)).execute().parse();
|
||||
if (data.getSite(i) == "The Sun") {
|
||||
element = doc.select("div.article__content");
|
||||
for (Element el : element.select("p")) {
|
||||
content += el.text();
|
||||
}
|
||||
data.setContent(content);
|
||||
}
|
||||
else if (data.getSite(i) == "BBC") {
|
||||
element = doc.select("div#story-body");
|
||||
for (Element el : element.select("p")) {
|
||||
content += el.text();
|
||||
}
|
||||
data.setContent(content);
|
||||
}
|
||||
else if (data.getSite(i) == "SKYSPORTS") {
|
||||
element = doc.select("div.article__body");
|
||||
for (Element el : element.select("p")) {
|
||||
if (!el.hasClass("widge-marketing__text")) {
|
||||
content += el.text();
|
||||
}
|
||||
}
|
||||
data.setContent(content);
|
||||
}
|
||||
System.out.println(data.getDate(i));
|
||||
System.out.println(data.getHeadline(i));
|
||||
System.out.println(data.getUrl(i));
|
||||
System.out.println(data.getSite(i));
|
||||
System.out.println(data.getContent(i));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static int changeDate2(String date) {
|
||||
date = date.substring(0, 2) + date.substring(2 + 1);
|
||||
date = date.substring(0, 4) + date.substring(4 + 1);
|
||||
String year = date.substring(4, 8);
|
||||
String month = date.substring(2,4);
|
||||
String day = date.substring(0, 2);
|
||||
String fdate = year + month + day;
|
||||
|
||||
int mydate = Integer.parseInt(fdate);
|
||||
return mydate;
|
||||
}
|
||||
|
||||
public static int changeDate(String date) {
|
||||
int formdate = 0;
|
||||
String sp[] = date.split(" ");
|
||||
formdate += Integer.parseInt(sp[2]) * 10000;
|
||||
formdate += Integer.parseInt(sp[0]);
|
||||
|
||||
switch (sp[1]) {
|
||||
case "January":
|
||||
case "Jan":
|
||||
formdate += 100;
|
||||
break;
|
||||
case "February":
|
||||
case "Feb":
|
||||
formdate += 200;
|
||||
break;
|
||||
case "March":
|
||||
case "Mar":
|
||||
formdate += 300;
|
||||
break;
|
||||
case "April":
|
||||
case "Apr":
|
||||
formdate += 400;
|
||||
break;
|
||||
case "May":
|
||||
formdate += 500;
|
||||
break;
|
||||
case "June":
|
||||
case "Jun":
|
||||
formdate += 600;
|
||||
break;
|
||||
case "July":
|
||||
case "Jul":
|
||||
formdate += 700;
|
||||
break;
|
||||
case "August":
|
||||
case "Aug":
|
||||
formdate += 800;
|
||||
break;
|
||||
case "September":
|
||||
case "Sep":
|
||||
formdate += 900;
|
||||
break;
|
||||
case "October":
|
||||
case "Oct":
|
||||
formdate += 1000;
|
||||
break;
|
||||
case "November":
|
||||
case "Nov":
|
||||
formdate += 1100;
|
||||
break;
|
||||
case "December":
|
||||
case "Dec":
|
||||
formdate += 1200;
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
return formdate;
|
||||
|
||||
}
|
||||
}
|
63
Article.java
63
Article.java
@ -1,63 +0,0 @@
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class Article {
|
||||
|
||||
public Article() {
|
||||
date = new ArrayList<Integer>();
|
||||
Headline = new ArrayList<String>();
|
||||
Url = new ArrayList<String>();
|
||||
Site = new ArrayList<String>();
|
||||
Content = new ArrayList<String>();
|
||||
}
|
||||
|
||||
private ArrayList<Integer> date;
|
||||
private ArrayList<String> Headline;
|
||||
private ArrayList<String> Url;
|
||||
private ArrayList<String> Site;
|
||||
private ArrayList<String> Content;
|
||||
|
||||
public void setDate(int num) {
|
||||
date.add(num);
|
||||
}
|
||||
|
||||
public void setHeadline(String head) {
|
||||
Headline.add(head);
|
||||
}
|
||||
|
||||
public void setUrl(String url) {
|
||||
Url.add(url);
|
||||
}
|
||||
|
||||
public void setSite(String site) {
|
||||
Site.add(site);
|
||||
}
|
||||
|
||||
public void setContent(String content) {
|
||||
Content.add(content);
|
||||
}
|
||||
|
||||
|
||||
public int getHowManyData() {
|
||||
return Headline.size();
|
||||
}
|
||||
|
||||
public int getDate(int num) {
|
||||
return date.get(num);
|
||||
}
|
||||
|
||||
public String getHeadline(int num) {
|
||||
return Headline.get(num);
|
||||
}
|
||||
|
||||
public String getUrl(int num) {
|
||||
return Url.get(num);
|
||||
}
|
||||
|
||||
public String getSite(int num) {
|
||||
return Site.get(num);
|
||||
}
|
||||
|
||||
public String getContent(int num) {
|
||||
return Content.get(num);
|
||||
}
|
||||
}
|
@ -11,8 +11,7 @@
|
||||
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||
</component>
|
||||
<component name="DefaultGradleProjectSettings">
|
||||
<option name="testRunner" value="GRADLE" />
|
||||
<option name="delegatedBuild" value="true" />
|
||||
<option name="isMigrated" value="true" />
|
||||
</component>
|
||||
<component name="FileEditorManager">
|
||||
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
||||
@ -20,16 +19,16 @@
|
||||
<entry file="file://$PROJECT_DIR$/pom.xml">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="102">
|
||||
<caret line="6" column="16" lean-forward="true" selection-start-line="6" selection-start-column="16" selection-end-line="6" selection-end-column="16" />
|
||||
<caret line="6" column="16" selection-start-line="6" selection-start-column="16" selection-end-line="6" selection-end-column="16" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/src/main/java/App.java">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="2111">
|
||||
<caret line="150" lean-forward="true" selection-start-line="150" selection-end-line="154" selection-end-column="1" />
|
||||
<state relative-caret-position="156">
|
||||
<caret line="16" column="54" lean-forward="true" selection-start-line="16" selection-start-column="54" selection-end-line="16" selection-end-column="54" />
|
||||
<folding>
|
||||
<element signature="imports" expanded="true" />
|
||||
</folding>
|
||||
@ -37,9 +36,37 @@
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<file pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/src/main/java/Article.java">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="493">
|
||||
<caret line="59" column="28" selection-start-line="59" selection-start-column="18" selection-end-line="59" selection-end-column="28" />
|
||||
<folding>
|
||||
<element signature="e#511#512#0" expanded="true" />
|
||||
<element signature="e#540#541#0" expanded="true" />
|
||||
<element signature="e#584#585#0" expanded="true" />
|
||||
<element signature="e#618#619#0" expanded="true" />
|
||||
<element signature="e#656#657#0" expanded="true" />
|
||||
<element signature="e#684#685#0" expanded="true" />
|
||||
<element signature="e#724#725#0" expanded="true" />
|
||||
<element signature="e#754#755#0" expanded="true" />
|
||||
<element signature="e#800#801#0" expanded="true" />
|
||||
<element signature="e#836#837#0" expanded="true" />
|
||||
<element signature="e#872#873#0" expanded="true" />
|
||||
<element signature="e#910#911#0" expanded="true" />
|
||||
<element signature="e#945#946#0" expanded="true" />
|
||||
<element signature="e#981#982#0" expanded="true" />
|
||||
<element signature="e#1023#1024#0" expanded="true" />
|
||||
<element signature="e#1063#1064#0" expanded="true" />
|
||||
<element signature="e#1100#1101#0" expanded="true" />
|
||||
<element signature="e#1135#1136#0" expanded="true" />
|
||||
<element signature="e#1173#1174#0" expanded="true" />
|
||||
<element signature="e#1209#1210#0" expanded="true" />
|
||||
<element signature="e#1250#1251#0" expanded="true" />
|
||||
<element signature="e#1289#1290#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
</leaf>
|
||||
@ -57,9 +84,10 @@
|
||||
<component name="IdeDocumentHistory">
|
||||
<option name="CHANGED_PATHS">
|
||||
<list>
|
||||
<option value="$PROJECT_DIR$/src/main/java/Article.java" />
|
||||
<option value="$PROJECT_DIR$/CAu_NLP_2019.iml" />
|
||||
<option value="$PROJECT_DIR$/pom.xml" />
|
||||
<option value="$PROJECT_DIR$/src/main/java/Article.java" />
|
||||
<option value="$PROJECT_DIR$/src/main/java/tfidf.java" />
|
||||
<option value="$PROJECT_DIR$/src/main/java/App.java" />
|
||||
</list>
|
||||
</option>
|
||||
@ -76,9 +104,42 @@
|
||||
<foldersAlwaysOnTop value="true" />
|
||||
</navigator>
|
||||
<panes>
|
||||
<pane id="Scope" />
|
||||
<pane id="PackagesPane" />
|
||||
<pane id="ProjectPane" />
|
||||
<pane id="Scope" />
|
||||
<pane id="ProjectPane">
|
||||
<subPane>
|
||||
<expand>
|
||||
<path>
|
||||
<item name="CAu_NLP_2019" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="CAu_NLP_2019" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="CAu_NLP_2019" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="CAu_NLP_2019" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="src" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="CAu_NLP_2019" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="CAu_NLP_2019" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="src" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="main" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="CAu_NLP_2019" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="CAu_NLP_2019" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="src" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="main" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="java" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="CAu_NLP_2019" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="CAu_NLP_2019" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="target" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
</expand>
|
||||
<select />
|
||||
</subPane>
|
||||
</pane>
|
||||
</panes>
|
||||
</component>
|
||||
<component name="PropertiesComponent">
|
||||
@ -130,8 +191,9 @@
|
||||
</component>
|
||||
<component name="ToolWindowManager">
|
||||
<frame x="-8" y="-8" width="1936" height="1096" extended-state="6" />
|
||||
<editor active="true" />
|
||||
<layout>
|
||||
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.18017058" />
|
||||
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.18176973" />
|
||||
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||
<window_info id="Image Layers" order="2" />
|
||||
<window_info id="Designer" order="3" />
|
||||
@ -140,15 +202,15 @@
|
||||
<window_info id="Favorites" order="6" side_tool="true" />
|
||||
<window_info anchor="bottom" id="Message" order="0" />
|
||||
<window_info anchor="bottom" id="Find" order="1" />
|
||||
<window_info anchor="bottom" id="Run" order="2" sideWeight="0.4989339" weight="0.32917964" />
|
||||
<window_info active="true" anchor="bottom" id="Run" order="2" sideWeight="0.49520257" visible="true" weight="0.32917964" />
|
||||
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
|
||||
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
|
||||
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
|
||||
<window_info anchor="bottom" id="TODO" order="6" />
|
||||
<window_info anchor="bottom" id="Terminal" order="7" sideWeight="0.49946696" weight="0.32917964" />
|
||||
<window_info anchor="bottom" id="Event Log" order="8" sideWeight="0.5010661" side_tool="true" visible="true" weight="0.32917964" />
|
||||
<window_info anchor="bottom" id="Event Log" order="8" sideWeight="0.50479746" side_tool="true" visible="true" weight="0.32917964" />
|
||||
<window_info anchor="bottom" id="Version Control" order="9" />
|
||||
<window_info anchor="bottom" id="Messages" order="10" weight="0.32917964" />
|
||||
<window_info anchor="bottom" id="Messages" order="10" sideWeight="0.4978678" weight="0.32917964" />
|
||||
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
|
||||
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
|
||||
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
|
||||
@ -188,13 +250,6 @@
|
||||
</layout-to-restore>
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/pom.xml">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="102">
|
||||
<caret line="6" column="16" lean-forward="true" selection-start-line="6" selection-start-column="16" selection-end-line="6" selection-end-column="16" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/CAu_NLP_2019.iml">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="17">
|
||||
@ -202,19 +257,61 @@
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/pom.xml">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="102">
|
||||
<caret line="6" column="16" selection-start-line="6" selection-start-column="16" selection-end-line="6" selection-end-column="16" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/main/java/Article.java">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="493">
|
||||
<caret line="59" column="28" selection-start-line="59" selection-start-column="18" selection-end-line="59" selection-end-column="28" />
|
||||
<folding>
|
||||
<element signature="e#511#512#0" expanded="true" />
|
||||
<element signature="e#540#541#0" expanded="true" />
|
||||
<element signature="e#584#585#0" expanded="true" />
|
||||
<element signature="e#618#619#0" expanded="true" />
|
||||
<element signature="e#656#657#0" expanded="true" />
|
||||
<element signature="e#684#685#0" expanded="true" />
|
||||
<element signature="e#724#725#0" expanded="true" />
|
||||
<element signature="e#754#755#0" expanded="true" />
|
||||
<element signature="e#800#801#0" expanded="true" />
|
||||
<element signature="e#836#837#0" expanded="true" />
|
||||
<element signature="e#872#873#0" expanded="true" />
|
||||
<element signature="e#910#911#0" expanded="true" />
|
||||
<element signature="e#945#946#0" expanded="true" />
|
||||
<element signature="e#981#982#0" expanded="true" />
|
||||
<element signature="e#1023#1024#0" expanded="true" />
|
||||
<element signature="e#1063#1064#0" expanded="true" />
|
||||
<element signature="e#1100#1101#0" expanded="true" />
|
||||
<element signature="e#1135#1136#0" expanded="true" />
|
||||
<element signature="e#1173#1174#0" expanded="true" />
|
||||
<element signature="e#1209#1210#0" expanded="true" />
|
||||
<element signature="e#1250#1251#0" expanded="true" />
|
||||
<element signature="e#1289#1290#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/main/java/TFIDF.java">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="357">
|
||||
<caret line="21" column="7" lean-forward="true" selection-start-line="21" selection-start-column="7" selection-end-line="21" selection-end-column="7" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/main/java/App.java">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="2111">
|
||||
<caret line="150" lean-forward="true" selection-start-line="150" selection-end-line="154" selection-end-column="1" />
|
||||
<state relative-caret-position="156">
|
||||
<caret line="16" column="54" lean-forward="true" selection-start-line="16" selection-start-column="54" selection-end-line="16" selection-end-column="54" />
|
||||
<folding>
|
||||
<element signature="imports" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/main/java/Article.java">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
</component>
|
||||
<component name="masterDetails">
|
||||
<states>
|
||||
|
@ -1,9 +1,8 @@
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Scanner;
|
||||
import java.util.StringTokenizer;
|
||||
import java.text.DecimalFormat;
|
||||
import java.util.*;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
@ -13,11 +12,14 @@ import org.jsoup.select.Elements;
|
||||
public class App {
|
||||
public static void main(String[] args) throws Exception {
|
||||
String url[] = {"https://www.thesun.co.uk/?s=", "https://www.bbc.co.uk/search?q=", "https://www.skysports.com/search?q="};
|
||||
//ArrayList<String> key = new ArrayList<String>();
|
||||
|
||||
List<List<String>> documents = new ArrayList<>();
|
||||
List<List<String>> result = new ArrayList<>();
|
||||
List<String> result2 = new ArrayList<>();
|
||||
String document;
|
||||
List<String> goodUrl = new ArrayList<>();
|
||||
Article data = new Article();
|
||||
//key.add("salah");
|
||||
//key.add("Liverpool");
|
||||
//key.add("Champions league");
|
||||
|
||||
Document doc = null;
|
||||
Elements element = null;
|
||||
|
||||
@ -28,10 +30,7 @@ public class App {
|
||||
for (int j = 0; j < url.length; j++) {
|
||||
for (int i = 0; i < key.length; i++) {
|
||||
String urlTmp = url[j] + key[i];
|
||||
doc = Jsoup.connect(urlTmp).execute().parse();
|
||||
/* } catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}*/
|
||||
doc = Jsoup.connect(urlTmp).execute().parse(); // Document에 url 페이지의 데이터를 가져온다.
|
||||
if (url[j].equals("https://www.thesun.co.uk/?s=")) {
|
||||
element = doc.select("div.search-results-wrap");
|
||||
for (Element el : element.select(".teaser-item")) {
|
||||
@ -42,6 +41,8 @@ public class App {
|
||||
String temp = el.select(".search-date").text();
|
||||
data.setDate(changeDate(temp));
|
||||
data.setSite("The Sun");
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -70,12 +71,75 @@ public class App {
|
||||
}
|
||||
}
|
||||
|
||||
String content = "";
|
||||
for(int i = 0; i < data.getHowManyData(); i++) {
|
||||
content = "";
|
||||
try {
|
||||
doc = Jsoup.connect(data.getUrl(i)).execute().parse();
|
||||
if (data.getSite(i) == "The Sun") {
|
||||
element = doc.select("div.article__content");
|
||||
for (Element el : element.select("p")) {
|
||||
content += el.text();
|
||||
}
|
||||
data.setContent(content);
|
||||
} else if (data.getSite(i) == "BBC") {
|
||||
element = doc.select("div#story-body");
|
||||
for (Element el : element.select("p")) {
|
||||
content += el.text();
|
||||
}
|
||||
data.setContent(content);
|
||||
} else if (data.getSite(i) == "SKYSPORTS") {
|
||||
element = doc.select("div.article__body");
|
||||
for (Element el : element.select("p")) {
|
||||
if (!el.hasClass("widge-marketing__text")) {
|
||||
content += el.text();
|
||||
}
|
||||
}
|
||||
data.setContent(content);
|
||||
}
|
||||
if (data.getContent(i).length() > 0){
|
||||
/*
|
||||
System.out.println(data.getDate(i));
|
||||
System.out.println(data.getHeadline(i));
|
||||
System.out.println(data.getUrl(i));
|
||||
System.out.println(data.getSite(i));
|
||||
|
||||
*/
|
||||
document = data.getContent((i)).replaceAll("\\s+",",");
|
||||
goodUrl.add(data.getUrl(i));
|
||||
documents.add(new ArrayList<String>(Arrays.asList(document.split(","))));
|
||||
}
|
||||
else
|
||||
System.out.println("No content: " + data.getUrl(i));
|
||||
}
|
||||
catch (Exception e) {
|
||||
System.out.println("Something went wrong.: " + e);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < documents.size(); i++){
|
||||
DecimalFormat df = new DecimalFormat("#.####");
|
||||
TFIDF calculator = new TFIDF();
|
||||
double tfidf = calculator.tfIdf(documents.get(i), documents, key[0]);
|
||||
result.add(Arrays.asList((df.format(tfidf)), goodUrl.get(i)));
|
||||
}
|
||||
for (int i = 0; i < result.size(); i++){
|
||||
System.out.println("TF-IDF: " + result.get(i).get(0) + " Url: " + result.get(i).get(1));
|
||||
}
|
||||
double max;
|
||||
int pos;
|
||||
while (result.size() > 0){
|
||||
pos = 0;
|
||||
max = Double.parseDouble(result.get(0).get(0));
|
||||
for (int i2 = 0; i2 < result.size(); i2++){
|
||||
if (Double.compare(max, Double.parseDouble(result.get(i2).get(0))) < 0){
|
||||
max = Double.parseDouble(result.get(i2).get(0));
|
||||
pos = i2;
|
||||
}
|
||||
}
|
||||
result2.add(result.get(pos).get(1));
|
||||
result.remove(pos);
|
||||
}
|
||||
for (int i = 0; i < result2.size(); i++){
|
||||
System.out.println("Url: " + result2.get(i));
|
||||
}
|
||||
|
||||
}
|
||||
@ -94,61 +158,8 @@ public class App {
|
||||
|
||||
public static int changeDate(String date) {
|
||||
int formdate = 0;
|
||||
String sp[] = date.split(" ");/*
|
||||
formdate += Integer.parseInt(sp[2]) * 10000;
|
||||
formdate += Integer.parseInt(sp[0]);
|
||||
|
||||
switch (sp[1]) {
|
||||
case "January":
|
||||
case "Jan":
|
||||
formdate += 100;
|
||||
break;
|
||||
case "February":
|
||||
case "Feb":
|
||||
formdate += 200;
|
||||
break;
|
||||
case "March":
|
||||
case "Mar":
|
||||
formdate += 300;
|
||||
break;
|
||||
case "April":
|
||||
case "Apr":
|
||||
formdate += 400;
|
||||
break;
|
||||
case "May":
|
||||
formdate += 500;
|
||||
break;
|
||||
case "June":
|
||||
case "Jun":
|
||||
formdate += 600;
|
||||
break;
|
||||
case "July":
|
||||
case "Jul":
|
||||
formdate += 700;
|
||||
break;
|
||||
case "August":
|
||||
case "Aug":
|
||||
formdate += 800;
|
||||
break;
|
||||
case "September":
|
||||
case "Sep":
|
||||
formdate += 900;
|
||||
break;
|
||||
case "October":
|
||||
case "Oct":
|
||||
formdate += 1000;
|
||||
break;
|
||||
case "November":
|
||||
case "Nov":
|
||||
formdate += 1100;
|
||||
break;
|
||||
case "December":
|
||||
case "Dec":
|
||||
formdate += 1200;
|
||||
break;
|
||||
|
||||
}*/
|
||||
|
||||
String sp[] = date.split(" ");
|
||||
formdate = 0;
|
||||
return formdate;
|
||||
|
||||
}
|
||||
|
@ -7,12 +7,14 @@ public class Article {
|
||||
Headline = new ArrayList<String>();
|
||||
Url = new ArrayList<String>();
|
||||
Site = new ArrayList<String>();
|
||||
Content = new ArrayList<String>();
|
||||
}
|
||||
|
||||
private ArrayList<Integer> date;
|
||||
private ArrayList<String> Headline;
|
||||
private ArrayList<String> Url;
|
||||
private ArrayList<String> Site;
|
||||
private ArrayList<String> Content;
|
||||
|
||||
public void setDate(int num) {
|
||||
date.add(num);
|
||||
@ -30,6 +32,11 @@ public class Article {
|
||||
Site.add(site);
|
||||
}
|
||||
|
||||
public void setContent(String content) {
|
||||
Content.add(content);
|
||||
}
|
||||
|
||||
|
||||
public int getHowManyData() {
|
||||
return Headline.size();
|
||||
}
|
||||
@ -45,7 +52,12 @@ public class Article {
|
||||
public String getUrl(int num) {
|
||||
return Url.get(num);
|
||||
}
|
||||
|
||||
public String getSite(int num) {
|
||||
return Site.get(num);
|
||||
}
|
||||
|
||||
public String getContent(int num) {
|
||||
return Content.get(num);
|
||||
}
|
||||
}
|
46
CAu_NLP_2019/src/main/java/TFIDF.java
Normal file
46
CAu_NLP_2019/src/main/java/TFIDF.java
Normal file
@ -0,0 +1,46 @@
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Minimal TF-IDF calculator over pre-tokenised documents.
 *
 * <p>A document is a list of words; terms are matched against words case-insensitively.
 * All methods return finite values: degenerate inputs yield {@code 0.0} instead of
 * {@code NaN} or infinity.
 */
public class TFIDF {
    /**
     * Computes the relative frequency of a term inside one document.
     *
     * @param doc  list of strings
     * @param term String represents a term
     * @return term frequency of term in document (matches / word count), or {@code 0.0}
     *         for an empty document
     */
    public double tf(List<String> doc, String term) {
        // An empty document would otherwise give 0 / 0 = NaN.
        if (doc.isEmpty()) {
            return 0.0;
        }
        int hits = 0;
        for (String word : doc) {
            if (term.equalsIgnoreCase(word)) {
                hits++;
            }
        }
        return (double) hits / doc.size();
    }

    /**
     * Computes the inverse document frequency of a term over a data set.
     *
     * @param docs list of list of strings represents the dataset
     * @param term String represents a term
     * @return {@code ln(number of documents / number of documents containing term)}, or
     *         {@code 0.0} when no document contains the term
     */
    public double idf(List<List<String>> docs, String term) {
        int containing = 0;
        for (List<String> doc : docs) {
            for (String word : doc) {
                if (term.equalsIgnoreCase(word)) {
                    containing++;
                    break; // one match is enough to count this document
                }
            }
        }
        // Dividing by zero would give +Infinity, and tf * idf would then be 0 * Infinity = NaN.
        if (containing == 0) {
            return 0.0;
        }
        return Math.log((double) docs.size() / containing);
    }

    /**
     * Computes the TF-IDF weight of a term for one document of a data set.
     *
     * @param doc  a text document
     * @param docs all documents
     * @param term term
     * @return the TF-IDF of term, i.e. {@code tf(doc, term) * idf(docs, term)}
     */
    public double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }
}
|
Binary file not shown.
Binary file not shown.
BIN
CAu_NLP_2019/target/classes/TFIDF.class
Normal file
BIN
CAu_NLP_2019/target/classes/TFIDF.class
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user