% Conference paper "Self-Repairing Data Scraping for Websites" (ICECCME 2024).
% Cleanup notes for this record:
%   - The former "series" field was a verbatim copy of "booktitle" and has been
%     dropped so the venue is not printed twice.
%   - The former free-text "note" (copyright line, event name, event dates) has
%     been split into "copyright", "eventtitle" and "eventdate"; styles that do
%     not know these fields simply ignore them.
%   - Authors are stored as "Last, First" so name parsing is unambiguous.
%   - The citation key is unchanged on purpose: existing citations depend on it.
@inproceedings{562fb909775e46f8a87d9761252b6687,
  author     = {Zuehlke, Samuel and Nitu, Joel and Sandler, Simone and Krauss, Oliver and St{\"o}ckl, Andreas},
  title      = {Self-Repairing Data Scraping for Websites},
  booktitle  = {International Conference on Electrical, Computer, Communications and Mechatronics Engineering, {ICECCME} 2024},
  eventtitle = {2024 4th International Conference on Electrical, Computer, Communications and Mechatronics Engineering ({ICECCME})},
  eventdate  = {2024-11-04/2024-11-06},
  publisher  = {IEEE},
  pages      = {1--4},
  year       = {2024},
  month      = nov,
  day        = {6},
  doi        = {10.1109/ICECCME62383.2024.10796733},
  isbn       = {979-8-3503-9119-0},
  language   = {English},
  copyright  = {{\textcopyright} 2024 IEEE},
  keywords   = {Mechatronics, System performance, Soft sensors, Large language models, Pipelines, Layout, Web pages, Maintenance engineering, Data models, Monitoring, Data Cleaning, Data Scraping, LLM},
  abstract   = {Pre-processing and cleaning data from the web is challenging, as web pages are updated regularly. Whenever a monitored page changes its layout, existing processing pipelines break and must be adapted to the new design. We present an approach to scraping website data using Large Language Models (LLMs) to determine the location of the desired information and create a JavaScript path to the object in the Document Object Model of the HTML page. Our approach automatically detects when the path cannot be parsed anymore and repairs the path, continuously updating the scraping. Based on the example of the website kununu.com, our approach allows for consistent scraping and self-repair without overly impacting system performance. LLMs are only activated when an error in the pipeline is detected. In the future, we plan to expand this approach with multiple websites and data sources.},
}