Автоматизированное чтение страниц на Java

Как можно прочитать данные с сайта, содержимое которого динамически формируется JS-библиотеками, такими как, например, knockout.js? Мучаюсь уже два дня. Не могли бы вы указать, где и что я использовал неправильно или упустил, либо что я мог бы использовать, чтобы решить данную проблему?

Пытался использовать jsoup, но получаю страницу до выполнения JS-кода. Ниже приведу пример с сайтом, который использует knockout.js: public class TelegrambotApplication {

/**
 * Downloads the search page with jsoup and prints its markup to standard output.
 *
 * <p>jsoup is used here purely as an HTTP client plus HTML parser, so the printed
 * markup is the document as it was delivered, before any page script has run.
 * NOTE(review): everything after {@code #} in the address is a URL fragment and is
 * presumably consumed only by the page's own scripts - confirm against the site.
 *
 * @param args command-line arguments (unused)
 * @throws RuntimeException wrapping the {@link IOException} if the page cannot be fetched
 */
public static void main(String[] args) {
    final String pageAddress = "http://www.encar.com/dc/dc_carsearchlist.do?carType=kor#!%7B%22action%22%3A%22(And.Year.range(202100..)._.Mileage.range(..20000)._.Hidden.N._.(C.CarType.Y._.(C.Manufacturer.%EA%B8%B0%EC%95%84._.(C.ModelGroup.%EC%B9%B4%EB%8B%88%EB%B0%9C._.(C.Model.%EC%B9%B4%EB%8B%88%EB%B0%9C%204%EC%84%B8%EB%8C%80._.(C.BadgeGroup.%EB%94%94%EC%A0%A4%202200cc._.Badge.9%EC%9D%B8%EC%8A%B9%20%EC%8B%9C%EA%B7%B8%EB%8B%88%EC%B2%98.)))))_.Color.%ED%9D%B0%EC%83%89._.Options.%EC%84%A0%EB%A3%A8%ED%94%84.)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22page%22%3A1%2C%22limit%22%3A20%2C%22searchKey%22%3A%22%22%2C%22loginCheck%22%3Afalse%7D";

    final Document fetchedPage;
    try {
        // Single GET request; the response body is parsed into a jsoup Document.
        fetchedPage = Jsoup.connect(pageAddress).get();
    } catch (IOException fetchFailure) {
        throw new RuntimeException(fetchFailure);
    }

    // Serialise the parsed document back to HTML and dump it.
    System.out.println(fetchedPage.html());
}
}

Пытался использовать WebDriver, но натыкаюсь на капчу:

/**
 * Loads the search page in headless Chrome through Selenium WebDriver, waits for
 * the result container to appear, and prints the rendered page source.
 */
public class TelegrambotApplication {

    /**
     * Opens the page, waits up to 60 seconds for the element with id
     * {@code sr_normal}, and prints the resulting page source to standard output.
     * Any failure is reported via its stack trace; the browser is always closed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args){
        String url="http://www.encar.com/dc/dc_carsearchlist.do?carType=kor#!%7B%22action%22%3A%22(And.Year.range(202100..)._.Mileage.range(..20000)._.Hidden.N._.(C.CarType.Y._.(C.Manufacturer.%EA%B8%B0%EC%95%84._.(C.ModelGroup.%EC%B9%B4%EB%8B%88%EB%B0%9C._.(C.Model.%EC%B9%B4%EB%8B%88%EB%B0%9C%204%EC%84%B8%EB%8C%80._.(C.BadgeGroup.%EB%94%94%EC%A0%A4%202200cc._.Badge.9%EC%9D%B8%EC%8A%B9%20%EC%8B%9C%EA%B7%B8%EB%8B%88%EC%B2%98.)))))_.Color.%ED%9D%B0%EC%83%89._.Options.%EC%84%A0%EB%A3%A8%ED%94%84.)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22page%22%3A1%2C%22limit%22%3A20%2C%22searchKey%22%3A%22%22%2C%22loginCheck%22%3Afalse%7D";

        ChromeOptions options = new ChromeOptions();
        // ChromeOptions.setHeadless(boolean) is deprecated and absent from newer
        // Selenium 4 releases; passing the Chrome switch directly works across 4.x.
        options.addArguments("--headless=new");

        WebDriver driver = new ChromeDriver(options);

        try {

            driver.get(url);

            // Block until the element with id "sr_normal" is present in the DOM,
            // or fail with a timeout after 60 seconds.
            WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(60L));

            wait.until(ExpectedConditions.presenceOfElementLocated(By.id("sr_normal")));

            // Page source as currently held by the browser, i.e. after the wait above.
            String htmlContent = driver.getPageSource();


            System.out.println(htmlContent);
        } catch (Exception e) {
            // Best-effort demo: report the failure and fall through to cleanup.
            e.printStackTrace();
        } finally {
            // Always shut the browser down so no chromedriver process is left behind.
            driver.quit();
        }
    }
}

Также пытался взять данные из document.data() и выполнить этот JS-код с помощью Nashorn, но получаю ошибки. Код:

/**
 * Fetches the search page with jsoup and tries to run the script data taken from
 * the document inside a Nashorn script engine.
 */
public class TelegrambotApplication {

    /**
     * Downloads the page, prints the value of {@code document.data()}, then passes
     * the page HTML and that data to {@link #executeJavaScript(String, String)} and
     * prints the returned string.
     *
     * @param args command-line arguments (unused)
     * @throws RuntimeException wrapping any I/O, script, or missing-function failure
     */
    public static void main(String[] args){
        String url="http://www.encar.com/dc/dc_carsearchlist.do?carType=kor#!%7B%22action%22%3A%22(And.Year.range(202100..)._.Mileage.range(..20000)._.Hidden.N._.(C.CarType.Y._.(C.Manufacturer.%EA%B8%B0%EC%95%84._.(C.ModelGroup.%EC%B9%B4%EB%8B%88%EB%B0%9C._.(C.Model.%EC%B9%B4%EB%8B%88%EB%B0%9C%204%EC%84%B8%EB%8C%80._.(C.BadgeGroup.%EB%94%94%EC%A0%A4%202200cc._.Badge.9%EC%9D%B8%EC%8A%B9%20%EC%8B%9C%EA%B7%B8%EB%8B%88%EC%B2%98.)))))_.Color.%ED%9D%B0%EC%83%89._.Options.%EC%84%A0%EB%A3%A8%ED%94%84.)%22%2C%22toggle%22%3A%7B%7D%2C%22layer%22%3A%22%22%2C%22sort%22%3A%22ModifiedDate%22%2C%22page%22%3A1%2C%22limit%22%3A20%2C%22searchKey%22%3A%22%22%2C%22loginCheck%22%3Afalse%7D";

        try {
            Document document= Jsoup.connect(url).get();
            String data= document.data();
            String html=document.html();

            System.out.println(data);
            String processedContent = executeJavaScript(html, data);

            System.out.println(processedContent);

        } catch (IOException | ScriptException | NoSuchMethodException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Evaluates {@code jsCode} in a Nashorn engine with {@code pageContent} exposed
     * as the global script variable {@code pageContent}, then calls the script's
     * top-level {@code getResult} function and returns its result as a string.
     *
     * <p>NOTE(review): the engine provides no browser objects, so scripts that rely
     * on {@code window} or {@code document} are expected to fail here - verify
     * against the actual page scripts.
     *
     * @param pageContent HTML text made available to the script as a plain string
     * @param jsCode JavaScript source to evaluate
     * @return string form of the value returned by {@code getResult}, or {@code null}
     *         if the function returned {@code null}
     * @throws ScriptException if {@code jsCode} fails to parse or throws
     * @throws NoSuchMethodException if the script defines no {@code getResult} function
     * @throws IllegalStateException if no Nashorn engine is available at runtime
     */
    private static String executeJavaScript(String pageContent, String jsCode) throws ScriptException, NoSuchMethodException {
        ScriptEngine engine = new ScriptEngineManager().getEngineByName("nashorn");
        if (engine == null) {
            // getEngineByName returns null when no matching engine is registered.
            throw new IllegalStateException("Nashorn script engine is not available");
        }

        // Bind the HTML as a value. Concatenating it into the script text would make
        // the engine parse the markup as JavaScript and fail on the first '<'.
        engine.put("pageContent", pageContent);

        engine.eval(jsCode);

        Invocable invocable = (Invocable) engine;
        // The function may return a non-String script value; convert instead of casting.
        Object result = invocable.invokeFunction("getResult");

        return result == null ? null : result.toString();
    }
}

Ошибки:

Exception in thread "main" java.lang.RuntimeException: javax.script.ScriptException: <eval>:1:18 Expected an operand but found <
var pageContent = <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
                  ^ in <eval> at line number 1 at column number 18
    at com.intership.telegrambot.TelegrambotApplication.main(TelegrambotApplication.java:28)
Caused by: javax.script.ScriptException: <eval>:1:18 Expected an operand but found <
var pageContent = <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
                  ^ in <eval> at line number 1 at column number 18
    at org.openjdk.nashorn.api.scripting.NashornScriptEngine.throwAsScriptException(NashornScriptEngine.java:463)
    at org.openjdk.nashorn.api.scripting.NashornScriptEngine.compileImpl(NashornScriptEngine.java:530)
    at org.openjdk.nashorn.api.scripting.NashornScriptEngine.compileImpl(NashornScriptEngine.java:517)
    at org.openjdk.nashorn.api.scripting.NashornScriptEngine.evalImpl(NashornScriptEngine.java:395)
    at org.openjdk.nashorn.api.scripting.NashornScriptEngine.eval(NashornScriptEngine.java:151)
    at java.scripting/javax.script.AbstractScriptEngine.eval(AbstractScriptEngine.java:262)
    at com.intership.telegrambot.TelegrambotApplication.executeJavaScript(TelegrambotApplication.java:57)
    at com.intership.telegrambot.TelegrambotApplication.main(TelegrambotApplication.java:23)
Caused by: org.openjdk.nashorn.internal.runtime.ParserException: <eval>:1:18 Expected an operand but found <
var pageContent = <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
                  ^
    at org.openjdk.nashorn.internal.parser.AbstractParser.error(AbstractParser.java:293)
    at org.openjdk.nashorn.internal.parser.AbstractParser.error(AbstractParser.java:278)
    at org.openjdk.nashorn.internal.parser.Parser.unaryExpression(Parser.java:4418)
    at org.openjdk.nashorn.internal.parser.Parser.expression(Parser.java:4568)
    at org.openjdk.nashorn.internal.parser.Parser.conditionalExpression(Parser.java:4720)
    at org.openjdk.nashorn.internal.parser.Parser.assignmentExpression(Parser.java:4659)
    at org.openjdk.nashorn.internal.parser.Parser.variableDeclarationList(Parser.java:1619)
    at org.openjdk.nashorn.internal.parser.Parser.variableStatement(Parser.java:1521)
    at org.openjdk.nashorn.internal.parser.Parser.statement(Parser.java:1033)
    at org.openjdk.nashorn.internal.parser.Parser.sourceElements(Parser.java:900)
    at org.openjdk.nashorn.internal.parser.Parser.program(Parser.java:835)
    at org.openjdk.nashorn.internal.parser.Parser.parse(Parser.java:322)
    at org.openjdk.nashorn.internal.parser.Parser.parse(Parser.java:282)
    at org.openjdk.nashorn.internal.runtime.Context.compile(Context.java:1484)
    at org.openjdk.nashorn.internal.runtime.Context.compileScript(Context.java:1451)
    at org.openjdk.nashorn.internal.runtime.Context.compileScript(Context.java:761)
    at org.openjdk.nashorn.api.scripting.NashornScriptEngine.compileImpl(NashornScriptEngine.java:528)
    ... 6 more

Ответы (0 шт):