#!/bin/bash
#
# Scrape car listings from autobazar.cz into a CSV file.
#
# Usage: <script> START_PAGE END_PAGE
#   START_PAGE, END_PAGE  non-negative decimal integers. Each request passes
#                         the current value as the `first=` query parameter,
#                         starting at START_PAGE and advancing by PAGE_STEP
#                         while the value is <= END_PAGE.
#
# Output: results.csv in the current directory (overwritten on every run).
# One fully quoted row per listing; a value that cannot be extracted is
# written as "N/A" so such rows can be filtered out later.
# Progress messages go to stdout, diagnostics to stderr.
#
# Requires: curl, iconv, grep with -P (PCRE) support, sed, tr, cut.
#
# `set -e` is deliberately not used: grep exits non-zero when a field is
# missing, which is an expected condition here (it becomes "N/A").

readonly PAGE_STEP=15
readonly OUTFILE="results.csv"
readonly LISTING_URL_BASE="https://www.autobazar.cz/inzeraty/?first="
# The rows carry eight fields (URL first, then the car name), so the header
# must name all eight.
readonly CSV_HEADER="URL,Name,Year,Driven,Fuel,EngineVolume,EnginePower,Price"

# NOTE(review): the HTML-markup parts of the patterns below appear to have
# been stripped from this copy of the script (empty link pattern, empty
# `(?=)` lookaheads, car-name patterns that start with bare line breaks).
# They are kept exactly as found; restore the markup against the live pages
# before relying on the output -- as written they will not match anything.
readonly LINK_PATTERN=''
# A car-name heading can carry its class in single or in double quotes, so
# two patterns are tried in turn. TODO: confirm both variants once restored.
readonly CAR_NAME_PATTERN=$'\n\n\\K[^<]+'
readonly CAR_NAME_PATTERN_ALT=$'\n\n\\K[^<]+'
readonly FIELD_END_LOOKAHEAD='(?=)'
# NOTE(review): the blank in the price pattern (and in the sed expression
# that cleans the price) was presumably `&nbsp;` originally -- confirm.
readonly PRICE_PATTERN='(?<= ).*?(?=<\/b>)'

# Print the usage line on stderr.
usage() {
  printf 'Usage: %s START_PAGE END_PAGE\n' "$0" >&2
}

# Filter: collapse every run of whitespace (including line breaks between
# several matches) into one space and trim both ends.
# Replaces the former `| xargs` trick, which parses quotes and backslashes and
# therefore failed on values such as "Kia Cee'd".
squish() {
  tr -s '[:space:]' ' ' | sed -e 's/^ //' -e 's/ $//'
}

# Print all arguments as one CSV row. Every field is quoted and embedded
# double quotes are doubled (RFC 4180).
csv_row() {
  local dq='"' field row=""
  for field in "$@"; do
    row+="${dq}${field//$dq/$dq$dq}${dq},"
  done
  printf '%s\n' "${row%,}"
}

# Print every match of PCRE $2 found in text $1, one match per line.
pcre_matches() {
  grep -oP -e "$2" <<<"$1"
}

# Print the tag-stripped, whitespace-normalised text that follows label $2
# in the flattened ad HTML $1. Prints nothing when the label is absent.
labeled_value() {
  local html=$1 label=$2
  pcre_matches "$html" "(?<=${label}).*?${FIELD_END_LOOKAHEAD}" \
    | sed 's/<[^>]*>//g' \
    | squish
}

# Download the listing at URL $1, extract its fields and append one row to
# OUTFILE. Missing values are written as "N/A".
scrape_listing() {
  local url=$1
  local ad_content car_name year driven fuel engine_volume engine_power price

  # The listing HTML is WINDOWS-1250 encoded, so it is converted to UTF-8.
  # Line breaks are removed because grep matches line by line and a value may
  # span several lines of markup.
  ad_content=$(curl -sS "$url" | iconv -f WINDOWS-1250 -t UTF-8 | tr -d '\n')

  car_name=$(pcre_matches "$ad_content" "$CAR_NAME_PATTERN" | squish)
  if [[ -z "$car_name" ]]; then
    car_name=$(pcre_matches "$ad_content" "$CAR_NAME_PATTERN_ALT" | squish)
  fi

  year=$(labeled_value "$ad_content" 'Rok výroby:')
  driven=$(labeled_value "$ad_content" 'Najeto:')
  fuel=$(labeled_value "$ad_content" 'Palivo:')
  engine_volume=$(labeled_value "$ad_content" 'Objem motoru:')
  engine_power=$(labeled_value "$ad_content" 'Výkon motoru:')
  price=$(pcre_matches "$ad_content" "$PRICE_PATTERN" | sed 's/ //g' | squish)

  csv_row "$url" "${car_name:-N/A}" "${year:-N/A}" "${driven:-N/A}" \
    "${fuel:-N/A}" "${engine_volume:-N/A}" "${engine_power:-N/A}" \
    "${price:-N/A}" >> "$OUTFILE"
}

# Entry point.
# Arguments: $1 - START_PAGE, $2 - END_PAGE (decimal integers)
# Returns:   1 after printing the usage line when the arguments are missing
#            or not decimal integers; 0 otherwise.
main() {
  if [[ $# -ne 2 || ! "$1" =~ ^[0-9]+$ || ! "$2" =~ ^[0-9]+$ ]]; then
    usage
    return 1
  fi

  # 10# forces base 10 so that an argument such as "08" is not rejected as an
  # invalid octal literal by the arithmetic context.
  local start_page=$((10#$1)) end_page=$((10#$2))
  local page page_url page_content links link_line relative_url

  printf '%s\n' "$CSV_HEADER" > "$OUTFILE"

  for ((page = start_page; page <= end_page; page += PAGE_STEP)); do
    page_url="${LISTING_URL_BASE}${page}"
    echo "Processing page: ${page_url}"

    # Flatten the page to one line so a link's markup cannot be split across
    # lines; grep -o then yields one link per output line.
    page_content=$(curl -sS "$page_url" | tr '\n' ' ')
    links=$(pcre_matches "$page_content" "$LINK_PATTERN")

    while IFS= read -r link_line; do
      # A here-string always delivers at least one (possibly empty) line;
      # skip it instead of fetching the bare URL "https:".
      [[ -n "$link_line" ]] || continue

      relative_url=$(grep -oP 'href="\/\/[^"]+' <<<"$link_line" | cut -d'"' -f2)
      [[ -n "$relative_url" ]] || continue

      echo ">> Processing listing: https:${relative_url}"
      scrape_listing "https:${relative_url}"
    done <<<"$links"
  done

  echo "Scraping completed. Results saved in ${OUTFILE}"
}

main "$@"