diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 1a55840..30dadd2 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -29,21 +29,18 @@
-
-
-
-
-
-
+
-
+
+
+
1559529262635
+
+
+
+
-
+
-
-
-
-
-
-
+
-
-
-
-
+
+
+
+
+
+
+
@@ -91,16 +90,24 @@
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/CAu_NLP_2019/.idea/misc.xml b/CAu_NLP_2019/.idea/misc.xml
index 2c4e090..4c7eb88 100644
--- a/CAu_NLP_2019/.idea/misc.xml
+++ b/CAu_NLP_2019/.idea/misc.xml
@@ -1,14 +1,17 @@
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/CAu_NLP_2019/.idea/workspace.xml b/CAu_NLP_2019/.idea/workspace.xml
index 0017d85..6419bae 100644
--- a/CAu_NLP_2019/.idea/workspace.xml
+++ b/CAu_NLP_2019/.idea/workspace.xml
@@ -1,7 +1,15 @@
-
+
+
+
+
+
+
+
+
+
@@ -15,20 +23,11 @@
-
-
-
-
-
-
-
-
-
-
-
+
+
@@ -36,10 +35,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
@@ -78,16 +105,37 @@
+
+
+ args
+ StaticLoggerBinder
+ changeDate
+ System.out.println
+ result
+ key
+
+
+ searchs
+ words
+
+
+
+
+
+
-
+
+
@@ -98,13 +146,14 @@
-
+
+
+
-
@@ -131,24 +180,27 @@
-
-
-
-
-
+
+
+
+
+
+
+
-
+
+
@@ -173,6 +225,9 @@
+
+
+
@@ -186,14 +241,22 @@
1559476475559
+
+
+
+
+
+
+
+
-
+
-
+
@@ -202,15 +265,17 @@
-
+
-
+
-
+
+
+
@@ -219,6 +284,7 @@
+
@@ -249,24 +315,30 @@
+
+
+
-
+
+
-
-
+
+
-
+
-
-
+
+
+
+
-
+
@@ -295,17 +367,34 @@
-
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
@@ -377,6 +466,7 @@
+ OpenJDK 11.0.2
diff --git a/CAu_NLP_2019/pom.xml b/CAu_NLP_2019/pom.xml
index f836009..fdc8ee1 100644
--- a/CAu_NLP_2019/pom.xml
+++ b/CAu_NLP_2019/pom.xml
@@ -4,24 +4,38 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4.0.0
-
- org.jsoup
- jsoup
- 1.11.3
-
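+    <dependencies>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.11.3</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.sparkjava</groupId>
+            <artifactId>spark-core</artifactId>
+            <version>2.8.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.google.code.gson</groupId>
+            <artifactId>gson</artifactId>
+            <version>2.8.5</version>
+        </dependency>
+    </dependencies>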
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.8.1
-
-
- 1.8
-
-
-
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.8.1
+
+
+ 1.8
+
+
+
PQMAN
CAu_NLP_2019
diff --git a/CAu_NLP_2019/src/main/java/App.java b/CAu_NLP_2019/src/main/java/App.java
index a09ecf4..aae70b5 100644
--- a/CAu_NLP_2019/src/main/java/App.java
+++ b/CAu_NLP_2019/src/main/java/App.java
@@ -1,166 +1,154 @@
-
-
-import java.io.IOException;
-import java.text.DecimalFormat;
-import java.util.*;
-
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
-
-public class App {
- public static void main(String[] args) throws Exception {
- String url[] = {"https://www.thesun.co.uk/?s=", "https://www.bbc.co.uk/search?q=", "https://www.skysports.com/search?q="};
-
- List> documents = new ArrayList<>();
- List> result = new ArrayList<>();
- List result2 = new ArrayList<>();
- String document;
- List goodUrl = new ArrayList<>();
- Article data = new Article();
-
- Document doc = null;
- Elements element = null;
-
- Scanner scanner = new Scanner(System.in);
- System.out.print("Please type keywords : ");
- String key[] = scanner.nextLine().split(",");
- scanner.close();
- for (int j = 0; j < url.length; j++) {
- for (int i = 0; i < key.length; i++) {
- String urlTmp = url[j] + key[i];
- doc = Jsoup.connect(urlTmp).execute().parse(); // Document에 url 페이지의 데이터를 가져온다.
- if (url[j].equals("https://www.thesun.co.uk/?s=")) {
- element = doc.select("div.search-results-wrap");
- for (Element el : element.select(".teaser-item")) {
- if (el.select("p").text().toLowerCase().contains(key[i].toLowerCase())) {
- data.setHeadline(el.select("p").text());
- Elements elUrl = el.select(".teaser__copy-container a");
- data.setUrl(elUrl.first().absUrl("href"));
- String temp = el.select(".search-date").text();
- data.setDate(changeDate(temp));
- data.setSite("The Sun");
-
-
- }
- }
- }
- if (url[j].equals("https://www.bbc.co.uk/search?q=")) {
- element = doc.select("section.search-content");
- for (Element el : element.select("li[data-result-number]")) {
- data.setHeadline(el.select("h1").select("a").text());
- Elements elUrl = el.select("a[href]");
- data.setUrl(elUrl.first().absUrl("href"));
- String temp = el.select(".display-date").text();
- data.setDate(changeDate(temp));
- data.setSite("BBC");
- }
- }
- if (url[j].equals("https://www.skysports.com/search?q=")) {
- element = doc.select("div.news-list");
- for (Element el : element.select("div.news-list__item")) {
- data.setHeadline(el.select("h4").select("a").text());
- Elements elUrl = el.select("a[href]");
- data.setUrl(elUrl.first().absUrl("href"));
- String temp = el.select(".label__timestamp").text();
- data.setDate(changeDate2(temp));
- data.setSite("SKYSPORTS");
- }
- }
- }
- }
-
- String content = "";
- for(int i = 0; i < data.getHowManyData(); i++) {
- content = "";
- try {
- doc = Jsoup.connect(data.getUrl(i)).execute().parse();
- if (data.getSite(i) == "The Sun") {
- element = doc.select("div.article__content");
- for (Element el : element.select("p")) {
- content += el.text();
- }
- data.setContent(content);
- } else if (data.getSite(i) == "BBC") {
- element = doc.select("div#story-body");
- for (Element el : element.select("p")) {
- content += el.text();
- }
- data.setContent(content);
- } else if (data.getSite(i) == "SKYSPORTS") {
- element = doc.select("div.article__body");
- for (Element el : element.select("p")) {
- if (!el.hasClass("widge-marketing__text")) {
- content += el.text();
- }
- }
- data.setContent(content);
- }
- if (data.getContent(i).length() > 0){
- /*
- System.out.println(data.getDate(i));
- System.out.println(data.getHeadline(i));
- System.out.println(data.getUrl(i));
- System.out.println(data.getSite(i));
- */
- document = data.getContent((i)).replaceAll("\\s+",",");
- goodUrl.add(data.getUrl(i));
- documents.add(new ArrayList(Arrays.asList(document.split(","))));
- }
- else
- System.out.println("No content: " + data.getUrl(i));
- }
- catch (Exception e) {
- System.out.println("Something went wrong.: " + e);
- }
- }
- for (int i = 0; i < documents.size(); i++){
- DecimalFormat df = new DecimalFormat("#.####");
- TFIDF calculator = new TFIDF();
- double tfidf = calculator.tfIdf(documents.get(i), documents, key[0]);
- result.add(Arrays.asList((df.format(tfidf)), goodUrl.get(i)));
- }
- for (int i = 0; i < result.size(); i++){
- System.out.println("TF-IDF: " + result.get(i).get(0) + " Url: " + result.get(i).get(1));
- }
- double max;
- int pos;
- while (result.size() > 0){
- pos = 0;
- max = Double.parseDouble(result.get(0).get(0));
- for (int i2 = 0; i2 < result.size(); i2++){
- if (Double.compare(max, Double.parseDouble(result.get(i2).get(0))) < 0){
- max = Double.parseDouble(result.get(i2).get(0));
- pos = i2;
- }
- }
- result2.add(result.get(pos).get(1));
- result.remove(pos);
- }
- for (int i = 0; i < result2.size(); i++){
- System.out.println("Url: " + result2.get(i));
- }
-
- }
-
- public static int changeDate2(String date) {
- date = date.substring(0, 2) + date.substring(2 + 1);
- date = date.substring(0, 4) + date.substring(4 + 1);
- String year = date.substring(4, 8);
- String month = date.substring(2,4);
- String day = date.substring(0, 2);
- String fdate = year + month + day;
-
- int mydate = Integer.parseInt(fdate);
- return mydate;
- }
-
- public static int changeDate(String date) {
- int formdate = 0;
- String sp[] = date.split(" ");
- formdate = 0;
- return formdate;
-
- }
+import com.google.gson.Gson;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.text.DecimalFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static spark.Spark.get;
+import static spark.Spark.port;
+
+public class App {
+
+ public static void main(String[] args) {
+ port(8080);
+ get("/search", (req, res) -> {
+ Gson gson = new Gson();
+ String[] words = req.queryParams("s").split("/+");
+ ArrayList results = getResults(words);
+ return gson.toJson(results);
+ });
+ }
+
+ private static ArrayList getResults(String[] words) throws Exception {
+ String[] url = {"https://www.thesun.co.uk/?s=", "https://www.bbc.co.uk/search?q=", "https://www.skysports.com/search?q="};
+
+ List> documents = new ArrayList<>();
+ List> result = new ArrayList<>();
+ List result2 = new ArrayList<>();
+ String document;
+ List goodUrl = new ArrayList<>();
+ Article data = new Article();
+
+ ArrayList results = new ArrayList<>();
+ Document doc = null;
+ Elements element = null;
+
+ for (String s : url) {
+ for (String word : words) {
+ String urlTmp = s + word;
+ doc = Jsoup.connect(urlTmp).execute().parse(); // Document에 url 페이지의 데이터를 가져온다.
+ if (s.equals("https://www.thesun.co.uk/?s=")) {
+ element = doc.select("div.search-results-wrap");
+ for (Element el : element.select(".teaser-item")) {
+ if (el.select("p").text().toLowerCase().contains(word.toLowerCase())) {
+ data.setHeadline(el.select("p").text());
+ Elements elUrl = el.select(".teaser__copy-container a");
+ data.setUrl(elUrl.first().absUrl("href"));
+ String temp = el.select(".search-date").text();
+ data.setDate(changeDate(temp));
+ data.setSite("The Sun");
+
+
+ }
+ }
+ }
+ if (s.equals("https://www.bbc.co.uk/search?q=")) {
+ element = doc.select("section.search-content");
+ for (Element el : element.select("li[data-result-number]")) {
+ data.setHeadline(el.select("h1").select("a").text());
+ Elements elUrl = el.select("a[href]");
+ data.setUrl(elUrl.first().absUrl("href"));
+ String temp = el.select(".display-date").text();
+ data.setDate(changeDate(temp));
+ data.setSite("BBC");
+ }
+ }
+ if (s.equals("https://www.skysports.com/search?q=")) {
+ element = doc.select("div.news-list");
+ for (Element el : element.select("div.news-list__item")) {
+ data.setHeadline(el.select("h4").select("a").text());
+ Elements elUrl = el.select("a[href]");
+ data.setUrl(elUrl.first().absUrl("href"));
+ String temp = el.select(".label__timestamp").text();
+ data.setDate(changeDate2(temp));
+ data.setSite("SKYSPORTS");
+ }
+ }
+ }
+ }
+
+ StringBuilder content = new StringBuilder();
+ for (int i = 0; i < data.getHowManyData(); i++) {
+ content = new StringBuilder();
+ try {
+ doc = Jsoup.connect(data.getUrl(i)).execute().parse();
+ switch (data.getSite(i)) {
+ case "The Sun":
+ element = doc.select("div.article__content");
+ for (Element el : element.select("p")) {
+ content.append(el.text());
+ }
+ data.setContent(content.toString());
+ break;
+ case "BBC":
+ element = doc.select("div#story-body");
+ for (Element el : element.select("p")) {
+ content.append(el.text());
+ }
+ data.setContent(content.toString());
+ break;
+ case "SKYSPORTS":
+ element = doc.select("div.article__body");
+ for (Element el : element.select("p")) {
+ if (!el.hasClass("widge-marketing__text")) {
+ content.append(el.text());
+ }
+ }
+ data.setContent(content.toString());
+ break;
+ }
+ if (data.getContent(i).length() > 0) {
+ document = data.getContent((i)).replaceAll("\\s+", ",");
+ goodUrl.add(data.getUrl(i));
+ documents.add(new ArrayList(Arrays.asList(document.split(","))));
+ } else
+ System.out.println("No content: " + data.getUrl(i));
+ } catch (Exception e) {
+ System.out.println("Something went wrong.: " + e);
+ }
+ }
+ for (int i = 0; i < documents.size(); i++) {
+ DecimalFormat df = new DecimalFormat("#.####");
+ TFIDF calculator = new TFIDF();
+ double tfidf = calculator.tfIdf(documents.get(i), documents, words[0]);
+ results.add(new Result(df.format(tfidf), goodUrl.get(i)));
+ }
+ System.out.println("End Request");
+ return results;
+ }
+
+    /**
+     * Converts a timestamp that starts with {@code dd?mm?yyyy} (any single separator character at
+     * positions 2 and 5) into the integer {@code yyyymmdd}; anything after position 10 is ignored.
+     * @throws StringIndexOutOfBoundsException if {@code date} has fewer than 10 characters
+     * @throws NumberFormatException if the year, month and day characters do not form an integer
+     */
+    private static int changeDate2(String date) {
+        String dayPart = date.substring(0, 2);
+        String monthPart = date.substring(3, 5);
+        return Integer.parseInt(date.substring(6, 10) + monthPart + dayPart);
+    }
+
+    /** Placeholder for parsing textual dates: currently ignores the text and always returns 0. */
+    private static int changeDate(String date) {
+        // The split result is unused; the call is kept so a null argument still throws as before.
+        date.split(" ");
+        return 0;
+    }
+
}
\ No newline at end of file
diff --git a/CAu_NLP_2019/src/main/java/Result.java b/CAu_NLP_2019/src/main/java/Result.java
new file mode 100644
index 0000000..89990eb
--- /dev/null
+++ b/CAu_NLP_2019/src/main/java/Result.java
@@ -0,0 +1,26 @@
+/** Pairs a TF-IDF score, kept as text, with the URL it belongs to. */
+public class Result {
+    private String tfidf;
+    private String url;
+    /** Builds a result from a score string and a URL; both are stored as given. */
+    public Result(String tfidf, String url) {
+        this.url = url;
+        this.tfidf = tfidf;
+    }
+    /** Returns the stored score string. */
+    public String getTfidf() {
+        return this.tfidf;
+    }
+    /** Returns the stored URL. */
+    public String getUrl() {
+        return this.url;
+    }
+    /** Replaces the stored score string. */
+    public void setTfidf(String tfidf) {
+        this.tfidf = tfidf;
+    }
+    /** Replaces the stored URL. */
+    public void setUrl(String url) {
+        this.url = url;
+    }
+}
diff --git a/CAu_NLP_2019/target/classes/App.class b/CAu_NLP_2019/target/classes/App.class
index fd32e94..fa234f0 100644
Binary files a/CAu_NLP_2019/target/classes/App.class and b/CAu_NLP_2019/target/classes/App.class differ