Автоматизированное чтение страниц на Java
Как можно прочитать данные с сайта, содержимое которого динамически формируется JS-библиотеками, такими как, например, knockout.js? Мучаюсь уже два дня. Не могли бы вы указать, где и что я использовал не так или упустил, либо что я мог бы использовать, чтобы решить данную проблему?
Пытался использовать jsoup, но получаю страницу до выполнения JS-кода. Ниже приведу пример с сайтом, который использует knockout.js:
public class TelegrambotApplication {
/**
 * Fetches the search URL with jsoup and prints the HTML of the parsed
 * document to standard output.
 *
 * @param args command-line arguments (unused)
 * @throws RuntimeException wrapping the {@link IOException} raised when the
 *         page cannot be fetched
 */
public static void main(String[] args) {
    final String pageUrl = "http://www.encar.com/dc/dc_carsearchlist.do?carType=kor#!%7B%22action%22%3A%22(And.Year.range(202100..)._.Mileage.range(..20000)._.Hidden.N._.(C.CarType.Y._.(C.Manufacturer.%EA%B8%B0%EC%95%84._.(C.ModelGroup.%EC%B9%B4%EB%8B%88%EB%B0%9C._.(C.Model.%EC%B9%B4%EB%8B%88%EB%B0%9C%204%EC%84%B8%EB%8C%80._.(C.BadgeGroup.%EB%94%94%EC%A0%A4%202200cc._.Badge.9%EC%9D%B8%EC%8A%B9%20%EC%8B%9C%EA%B7%B8%EB%8B%88%EC%B2%98.)))))_.Color.%ED%9D%B0%EC%83%89._.Options.%EC%84%A0%EB%A3%A8%ED%94%84.)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22page%22%3A1%2C%22limit%22%3A20%2C%22searchKey%22%3A%22%22%2C%22loginCheck%22%3Afalse%7D";
    try {
        // Fetch, parse and dump in one chain; only get() can raise IOException.
        System.out.println(Jsoup.connect(pageUrl).get().html());
    } catch (IOException fetchFailure) {
        throw new RuntimeException(fetchFailure);
    }
}
}
Пытался использовать WebDriver, но натыкаюсь на капчу:
/**
 * Loads the search page in headless Chrome through Selenium WebDriver and
 * prints the page source once the element with id {@code sr_normal} is present.
 */
public class TelegrambotApplication {

    /** Search page address, including the client-side {@code #!} state fragment. */
    private static final String SEARCH_URL = "http://www.encar.com/dc/dc_carsearchlist.do?carType=kor#!%7B%22action%22%3A%22(And.Year.range(202100..)._.Mileage.range(..20000)._.Hidden.N._.(C.CarType.Y._.(C.Manufacturer.%EA%B8%B0%EC%95%84._.(C.ModelGroup.%EC%B9%B4%EB%8B%88%EB%B0%9C._.(C.Model.%EC%B9%B4%EB%8B%88%EB%B0%9C%204%EC%84%B8%EB%8C%80._.(C.BadgeGroup.%EB%94%94%EC%A0%A4%202200cc._.Badge.9%EC%9D%B8%EC%8A%B9%20%EC%8B%9C%EA%B7%B8%EB%8B%88%EC%B2%98.)))))_.Color.%ED%9D%B0%EC%83%89._.Options.%EC%84%A0%EB%A3%A8%ED%94%84.)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22page%22%3A1%2C%22limit%22%3A20%2C%22searchKey%22%3A%22%22%2C%22loginCheck%22%3Afalse%7D";

    /**
     * Starts headless Chrome, opens {@link #SEARCH_URL}, waits up to 60 seconds
     * for the {@code sr_normal} element and prints the resulting page source.
     * Any exception raised while loading or waiting is printed as a stack trace;
     * the browser is always shut down afterwards.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        ChromeOptions chromeOptions = new ChromeOptions();
        chromeOptions.setHeadless(true);
        WebDriver browser = new ChromeDriver(chromeOptions);
        try {
            browser.get(SEARCH_URL);
            // Block until the awaited element exists in the DOM (or the timeout expires).
            new WebDriverWait(browser, Duration.ofSeconds(60L))
                    .until(ExpectedConditions.presenceOfElementLocated(By.id("sr_normal")));
            System.out.println(browser.getPageSource());
        } catch (Exception failure) {
            failure.printStackTrace();
        } finally {
            // Runs on both success and failure so the browser process is released.
            browser.quit();
        }
    }
}
Также пытался взять данные из document.data() и выполнить полученный JS-код с помощью Nashorn, но получаю ошибки. Код:
/**
 * Downloads the search page with jsoup, prints the document's combined data
 * and then evaluates that data as JavaScript in the Nashorn script engine.
 */
public class TelegrambotApplication {

    /**
     * Fetches the page, prints {@code document.data()} and prints whatever
     * {@link #executeJavaScript(String, String)} returns for it.
     *
     * @param args command-line arguments (unused)
     * @throws RuntimeException wrapping any I/O or script failure
     */
    public static void main(String[] args) {
        String url = "http://www.encar.com/dc/dc_carsearchlist.do?carType=kor#!%7B%22action%22%3A%22(And.Year.range(202100..)._.Mileage.range(..20000)._.Hidden.N._.(C.CarType.Y._.(C.Manufacturer.%EA%B8%B0%EC%95%84._.(C.ModelGroup.%EC%B9%B4%EB%8B%88%EB%B0%9C._.(C.Model.%EC%B9%B4%EB%8B%88%EB%B0%9C%204%EC%84%B8%EB%8C%80._.(C.BadgeGroup.%EB%94%94%EC%A0%A4%202200cc._.Badge.9%EC%9D%B8%EC%8A%B9%20%EC%8B%9C%EA%B7%B8%EB%8B%88%EC%B2%98.)))))_.Color.%ED%9D%B0%EC%83%89._.Options.%EC%84%A0%EB%A3%A8%ED%94%84.)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22page%22%3A1%2C%22limit%22%3A20%2C%22searchKey%22%3A%22%22%2C%22loginCheck%22%3Afalse%7D";
        try {
            Document document = Jsoup.connect(url).get();
            // NOTE(review): jsoup documents data() as covering script, comment and
            // CSS content together, so this may not be pure JavaScript - confirm.
            String data = document.data();
            String html = document.html();
            System.out.println(data);
            String processedContent = executeJavaScript(html, data);
            System.out.println(processedContent);
        } catch (IOException | ScriptException | NoSuchMethodException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Evaluates {@code jsCode} in Nashorn with the page HTML exposed as the
     * script variable {@code pageContent}, then calls the script's global
     * function {@code getResult} and returns its result as text.
     *
     * <p>NOTE(review): Nashorn is presumably a plain ECMAScript engine without
     * browser globals such as {@code window} or {@code document}; page scripts
     * that rely on them are not expected to run here - confirm.
     *
     * @param pageContent HTML of the page, made available to the script as a string
     * @param jsCode JavaScript source to evaluate; it must define {@code getResult}
     * @return the string form of the value returned by {@code getResult}, or
     *         {@code null} when the function returns nothing
     * @throws ScriptException if {@code jsCode} fails to parse or run
     * @throws NoSuchMethodException if the script defines no {@code getResult} function
     * @throws IllegalStateException if no engine named "nashorn" is available
     */
    private static String executeJavaScript(String pageContent, String jsCode) throws ScriptException, NoSuchMethodException {
        ScriptEngine engine = new ScriptEngineManager().getEngineByName("nashorn");
        if (engine == null) {
            // getEngineByName returns null when no matching engine is registered.
            throw new IllegalStateException("Script engine \"nashorn\" is not available on the classpath");
        }
        // Bind the HTML as a value. Splicing it into JavaScript source text made
        // the parser read the markup as code ("Expected an operand but found <").
        engine.put("pageContent", pageContent);
        engine.eval(jsCode);
        Invocable invocable = (Invocable) engine;
        Object result = invocable.invokeFunction("getResult");
        // Convert explicitly so a non-String result does not fail with ClassCastException.
        return result == null ? null : result.toString();
    }
}
Ошибки:
Exception in thread "main" java.lang.RuntimeException: javax.script.ScriptException: <eval>:1:18 Expected an operand but found <
var pageContent = <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
^ in <eval> at line number 1 at column number 18
at com.intership.telegrambot.TelegrambotApplication.main(TelegrambotApplication.java:28)
Caused by: javax.script.ScriptException: <eval>:1:18 Expected an operand but found <
var pageContent = <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
^ in <eval> at line number 1 at column number 18
at org.openjdk.nashorn.api.scripting.NashornScriptEngine.throwAsScriptException(NashornScriptEngine.java:463)
at org.openjdk.nashorn.api.scripting.NashornScriptEngine.compileImpl(NashornScriptEngine.java:530)
at org.openjdk.nashorn.api.scripting.NashornScriptEngine.compileImpl(NashornScriptEngine.java:517)
at org.openjdk.nashorn.api.scripting.NashornScriptEngine.evalImpl(NashornScriptEngine.java:395)
at org.openjdk.nashorn.api.scripting.NashornScriptEngine.eval(NashornScriptEngine.java:151)
at java.scripting/javax.script.AbstractScriptEngine.eval(AbstractScriptEngine.java:262)
at com.intership.telegrambot.TelegrambotApplication.executeJavaScript(TelegrambotApplication.java:57)
at com.intership.telegrambot.TelegrambotApplication.main(TelegrambotApplication.java:23)
Caused by: org.openjdk.nashorn.internal.runtime.ParserException: <eval>:1:18 Expected an operand but found <
var pageContent = <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
^
at org.openjdk.nashorn.internal.parser.AbstractParser.error(AbstractParser.java:293)
at org.openjdk.nashorn.internal.parser.AbstractParser.error(AbstractParser.java:278)
at org.openjdk.nashorn.internal.parser.Parser.unaryExpression(Parser.java:4418)
at org.openjdk.nashorn.internal.parser.Parser.expression(Parser.java:4568)
at org.openjdk.nashorn.internal.parser.Parser.conditionalExpression(Parser.java:4720)
at org.openjdk.nashorn.internal.parser.Parser.assignmentExpression(Parser.java:4659)
at org.openjdk.nashorn.internal.parser.Parser.variableDeclarationList(Parser.java:1619)
at org.openjdk.nashorn.internal.parser.Parser.variableStatement(Parser.java:1521)
at org.openjdk.nashorn.internal.parser.Parser.statement(Parser.java:1033)
at org.openjdk.nashorn.internal.parser.Parser.sourceElements(Parser.java:900)
at org.openjdk.nashorn.internal.parser.Parser.program(Parser.java:835)
at org.openjdk.nashorn.internal.parser.Parser.parse(Parser.java:322)
at org.openjdk.nashorn.internal.parser.Parser.parse(Parser.java:282)
at org.openjdk.nashorn.internal.runtime.Context.compile(Context.java:1484)
at org.openjdk.nashorn.internal.runtime.Context.compileScript(Context.java:1451)
at org.openjdk.nashorn.internal.runtime.Context.compileScript(Context.java:761)
at org.openjdk.nashorn.api.scripting.NashornScriptEngine.compileImpl(NashornScriptEngine.java:528)
... 6 more