#!/bin/bash
#
# Scrape car listings from autobazar.cz into a CSV file.
#
# Usage: scrape.sh START_PAGE END_PAGE
#
# START_PAGE and END_PAGE are values for the index page's `first=` query
# parameter. The script requests every PAGE_STEP-th value in that inclusive
# range, visits each listing link found on the index page and appends one
# quoted CSV row per listing to results.csv (the file is overwritten on every
# run). Fields that cannot be extracted are written as "N/A" so that
# incomplete rows can be filtered out afterwards.
#
# Progress is printed to stdout, diagnostics to stderr.
# Exit status: 1 on bad arguments, 0 otherwise.
#
# Requires: curl, iconv, grep with -P support, sed, tr.

set -u

readonly PAGE_STEP=15
readonly OUTFILE="results.csv"
readonly INDEX_URL="https://www.autobazar.cz/inzeraty/?first="
readonly CSV_HEADER="URL,Name,Year,Driven,Fuel,EngineVolume,EnginePower,Price"

# Non-breaking space (U+00A0, UTF-8 encoded), removed from prices together
# with ordinary spaces.
readonly NBSP=$'\xc2\xa0'

# --- Site-specific markup patterns ------------------------------------------
# NOTE(review): in the copy of this script that was reviewed, the HTML tag
# literals inside these patterns were missing (apparently stripped in
# transit). The values below reproduce what was left. As they stand they
# cannot select real listings or values; restore the original tags and
# confirm them against the live markup before running — TODO.

# Selects one match per listing on an index page; the href is taken from each
# match afterwards. Presumably the listing's opening anchor tag. Empty as found.
readonly LISTING_ANCHOR_REGEX=''

# Markup immediately preceding the car name. The original tried two variants,
# one with single-quoted and one with double-quoted attributes; only the
# trailing `\K[^<]+` of both survived.
readonly TITLE_OPEN_REGEX_SQ=''
readonly TITLE_OPEN_REGEX_DQ=''

# Markup that ends a labelled value (e.g. after "Rok výroby:"). Empty as found.
readonly FIELD_END_REGEX=''

# Price pattern, verbatim. The lookbehind is a single space as found and is
# most likely truncated as well.
readonly PRICE_REGEX='(?<= ).*?(?=<\/b>)'
# -----------------------------------------------------------------------------

# Print a diagnostic to stderr.
warn() {
  printf 'scrape: %s\n' "$*" >&2
}

# Print the usage line and exit with status 1.
usage() {
  printf 'Usage: %s START_PAGE END_PAGE\n' "$0" >&2
  exit 1
}

# Filter: join all input lines, collapse whitespace runs to one space and
# trim both ends. Unlike `xargs`, this keeps quotes and backslashes and does
# not fail on an unmatched quote such as the apostrophe in "Cee'd".
squeeze() {
  tr '\n' ' ' | sed -E 's/[[:space:]]+/ /g; s/^ //; s/ $//'
}

# Download an index page ($1) and replace its newlines with spaces.
# Returns curl's exit status.
fetch_index() {
  curl -fsS "$1" | tr '\n' ' '
  return "${PIPESTATUS[0]}"
}

# Download a listing page ($1). Per the original author's note the listing
# HTML is windows-1250 encoded, so it is converted to UTF-8; all newlines are
# then removed so each pattern can match across the whole document.
# Returns curl's exit status.
fetch_listing() {
  curl -fsS "$1" | iconv -f WINDOWS-1250 -t UTF-8 | tr -d '\n'
  return "${PIPESTATUS[0]}"
}

# Print the protocol-relative listing URLs ("//host/path") found in the index
# page HTML ($1), one per line. Matches without such an href yield nothing.
listing_urls() {
  grep -oP "$LISTING_ANCHOR_REGEX" <<<"$1" | grep -oP 'href="\K//[^"]+'
}

# Print the text that follows the opening markup $2 in listing HTML $1.
# Prints nothing when $2 is empty: without a prefix the pattern would match
# every text run in the document.
extract_title() {
  local html=$1 open_regex=$2
  [[ -n "$open_regex" ]] || return 0
  grep -oP "${open_regex}\K[^<]+" <<<"$html" | squeeze
}

# Print the value that follows the literal label $2 in listing HTML $1, with
# any tags inside the value removed.
extract_field() {
  local html=$1 label=$2
  grep -oP "(?<=\Q${label}\E).*?(?=${FIELD_END_REGEX})" <<<"$html" \
    | sed 's/<[^>]*>//g' \
    | squeeze
}

# Print the price found in listing HTML $1 with all spaces removed. Ordinary
# spaces are what the original removed; `&nbsp;` entities and U+00A0 are
# removed too, since a plain-space rule would miss them.
extract_price() {
  grep -oP "$PRICE_REGEX" <<<"$1" \
    | sed -e 's/&nbsp;//g' -e "s/${NBSP}//g" -e 's/ //g' \
    | squeeze
}

# Print $1 as a quoted CSV cell: an empty value becomes N/A and embedded
# double quotes are doubled.
csv_field() {
  local value=${1:-N/A}
  local quote='"'
  printf '"%s"' "${value//$quote/$quote$quote}"
}

# Append one CSV row made of all arguments to OUTFILE.
write_row() {
  local -a cells=()
  local value
  for value in "$@"; do
    cells+=("$(csv_field "$value")")
  done
  local IFS=,
  printf '%s\n' "${cells[*]}" >>"$OUTFILE"
}

# Download the listing at $1, extract its fields and append them as one row.
# A failed download is reported and still recorded, as a row of N/A values.
scrape_listing() {
  local url=$1
  local html name year driven fuel engine_volume engine_power price

  echo ">> Processing listing: ${url}"
  html=$(fetch_listing "$url") \
    || warn "download failed, recording N/A values: ${url}"

  # Try the single-quoted markup variant first, then the double-quoted one.
  name=$(extract_title "$html" "$TITLE_OPEN_REGEX_SQ")
  [[ -n "$name" ]] || name=$(extract_title "$html" "$TITLE_OPEN_REGEX_DQ")

  year=$(extract_field "$html" 'Rok výroby:')
  driven=$(extract_field "$html" 'Najeto:')
  fuel=$(extract_field "$html" 'Palivo:')
  engine_volume=$(extract_field "$html" 'Objem motoru:')
  engine_power=$(extract_field "$html" 'Výkon motoru:')
  price=$(extract_price "$html")

  write_row "$url" "$name" "$year" "$driven" "$fuel" \
    "$engine_volume" "$engine_power" "$price"
}

# Entry point: validate the arguments, write the CSV header, then walk the
# index pages and scrape every listing found on them.
main() {
  (( $# == 2 )) || usage
  # Digits only: the values are used in arithmetic, which would otherwise
  # evaluate arbitrary expressions.
  [[ $1 =~ ^[0-9]+$ && $2 =~ ^[0-9]+$ ]] || usage

  # Force base 10 so values with leading zeros are not read as octal.
  local -r start_page=$((10#$1))
  local -r end_page=$((10#$2))
  local page page_url index_html link found

  echo "$CSV_HEADER" >"$OUTFILE"

  for ((page = start_page; page <= end_page; page += PAGE_STEP)); do
    page_url="${INDEX_URL}${page}"
    echo "Processing page: ${page_url}"

    if ! index_html=$(fetch_index "$page_url"); then
      warn "download failed, skipping page: ${page_url}"
      continue
    fi

    found=0
    while IFS= read -r link; do
      [[ -n "$link" ]] || continue
      found=1
      scrape_listing "https:${link}"
    done < <(listing_urls "$index_html")

    (( found )) || warn "no listing links found on ${page_url}"
  done

  echo "Scraping completed. Results saved in ${OUTFILE}"
}

main "$@"