Added scrape script

This commit is contained in:
Thastertyn 2025-03-09 21:07:58 +01:00
parent 85815a81b5
commit c753acb6f5

64
scrape.sh Executable file
View File

@ -0,0 +1,64 @@
#!/bin/bash
#
# Scrape car listings from autobazar.cz into a CSV file (results.csv).
#
# Usage: scrape.sh START_PAGE END_PAGE
#
# START_PAGE and END_PAGE are values for the `first` query parameter of the
# listing index URL. The script requests first=START_PAGE, START_PAGE+15, ...
# for as long as the value is <= END_PAGE.
# NOTE(review): presumably `first` is a listing offset and the site shows 15
# listings per index page -- confirm against the site.
#
# For every listing linked from an index page one row is appended:
#   URL,Name,Year,Driven,Fuel,EngineVolume,EnginePower,Price
# A field that cannot be found in the listing HTML is written as "N/A" so that
# incomplete rows are easy to filter out later.
#
# Requires: bash, curl, iconv, GNU grep (for -P), sed, tr.

# No `set -e` / `pipefail`: grep exits 1 when a field is simply absent, which
# is an expected outcome here and is handled by the "N/A" fallback.
set -u

readonly PAGE_STEP=15
readonly OUTFILE="results.csv"
readonly INDEX_URL="https://www.autobazar.cz/inzeraty/?first="

# Print the usage line to stderr and terminate with status 1.
usage() {
  echo "Usage: $0 START_PAGE END_PAGE" >&2
  exit 1
}

# Collapse every run of whitespace on stdin into one space and trim both ends.
# Used instead of `| xargs`, which interprets quotes and backslashes and fails
# with "unmatched quote" on values such as  Kia Cee'd.
squash_ws() {
  tr -s '[:space:]' ' ' | sed -e 's/^ //' -e 's/ $//'
}

# Print the protocol-relative URL (//host/.../inzerat/...) of every listing
# anchor found in the index-page HTML given as $1, one per line.
# The pattern is pure ASCII, so matching is done byte-wise (LC_ALL=C, -a);
# the index page is not re-encoded and non-UTF-8 bytes could otherwise make
# grep treat the input as binary and suppress the matches.
listing_links() {
  printf '%s\n' "$1" \
    | LC_ALL=C grep -aoP '<a href="\K//[^"]+/inzerat/[^"]+(?=" class="fnd ti">)'
}

# Print the tag-stripped, whitespace-normalised content of the <td> that
# follows the label cell <td class="ti">$1</td> in the listing HTML $2.
# $1 is inserted into the regex verbatim; callers pass literal labels only.
spec_value() {
  local label=$1 html=$2
  printf '%s\n' "$html" \
    | grep -oP "(?<=<td class=\"ti\">${label}</td><td>).*?(?=</td>)" \
    | sed 's/<[^>]*>//g' \
    | squash_ws
}

# Print all arguments as one CSV record: every field is wrapped in double
# quotes and embedded double quotes are doubled.
csv_row() {
  local dq='"' field row=""
  for field in "$@"; do
    field=${field//"$dq"/$dq$dq}
    row+="${dq}${field}${dq},"
  done
  printf '%s\n' "${row%,}"
}

main() {
  [[ $# -eq 2 ]] || usage
  if ! [[ $1 =~ ^[0-9]+$ && $2 =~ ^[0-9]+$ ]]; then
    echo "START_PAGE and END_PAGE must be non-negative integers" >&2
    usage
  fi

  # 10# forces base 10 so that input such as 08 is not rejected as bad octal.
  START_PAGE=$((10#$1))
  END_PAGE=$((10#$2))

  local page page_url page_content relative_url full_url ad_content
  local car_name year driven fuel engine_volume engine_power price

  # The header names all eight fields that csv_row emits below.
  if ! echo "URL,Name,Year,Driven,Fuel,EngineVolume,EnginePower,Price" > "$OUTFILE"; then
    echo "Cannot write to ${OUTFILE}" >&2
    exit 1
  fi

  for ((page = START_PAGE; page <= END_PAGE; page += PAGE_STEP)); do
    page_url="${INDEX_URL}${page}"
    echo "Processing page: ${page_url}"

    # Flatten to a single line so an anchor split across lines still matches.
    page_content=$(curl -fsS "$page_url" | tr '\n' ' ')

    # Links arrive on fd 3 so nothing inside the loop can consume them via stdin.
    while IFS= read -r relative_url <&3; do
      # An index page without listings must not yield a bogus "https:" row.
      [[ -n "$relative_url" ]] || continue

      full_url="https:${relative_url}"
      echo ">> Processing listing: ${full_url}"

      # The downloaded HTML is encoded in windows-1250 -> re-encode to UTF-8,
      # then drop all newlines so each field can be matched on one line.
      ad_content=$(curl -fsS "$full_url" | iconv -f WINDOWS-1250 -t UTF-8 | tr -d '\n')

      # The title span's class attribute appears with double quotes on some
      # pages and single quotes on others -> try both, double quotes first.
      car_name=$(printf '%s\n' "$ad_content" \
        | grep -oP '<span class="title"><h1>\K[^<]+' | squash_ws)
      if [[ -z "$car_name" ]]; then
        car_name=$(printf '%s\n' "$ad_content" \
          | grep -oP "<span class='title'><h1>\\K[^<]+" | squash_ws)
      fi

      year=$(spec_value 'Rok výroby:' "$ad_content")
      driven=$(spec_value 'Najeto:' "$ad_content")
      fuel=$(spec_value 'Palivo:' "$ad_content")
      engine_volume=$(spec_value 'Objem motoru:' "$ad_content")
      engine_power=$(spec_value 'Výkon motoru:' "$ad_content")
      price=$(printf '%s\n' "$ad_content" \
        | grep -oP '(?<=<span class="cena2"> <b>).*?(?=</b>)' \
        | sed 's/&nbsp;//g' \
        | squash_ws)

      # Missing values become "N/A"; later used for filtering invalid data.
      csv_row "$full_url" \
        "${car_name:-N/A}" "${year:-N/A}" "${driven:-N/A}" "${fuel:-N/A}" \
        "${engine_volume:-N/A}" "${engine_power:-N/A}" "${price:-N/A}" \
        >> "$OUTFILE"
    done 3< <(listing_links "$page_content")
  done

  echo "Scraping completed. Results saved in ${OUTFILE}"
}

main "$@"