65 lines
2.5 KiB
Bash
Executable File
65 lines
2.5 KiB
Bash
Executable File
#!/bin/bash
#
# Scrape car listings from www.autobazar.cz into a CSV file.
#
# Usage: <script> START_PAGE END_PAGE
#   START_PAGE  first value sent as the "first" query parameter of the
#               listing index
#   END_PAGE    last value (inclusive) of that parameter
#
# The "first" parameter is advanced in steps of PAGE_STEP between the two
# bounds. Every listing linked from each index page is downloaded and one
# CSV row is appended per listing.
#
# Output:   results.csv in the current directory (overwritten on start);
#           progress messages on stdout, warnings on stderr.
# Requires: curl, iconv, grep with -P (PCRE) support, sed, tr, cut.

# Make a pipeline report failure when any stage fails, so download and
# conversion problems can be detected below. Pipelines whose status is
# not checked are unaffected.
set -o pipefail

# Increment applied to the "first" query parameter between index pages.
# NOTE(review): presumably the number of listings per index page - confirm.
readonly PAGE_STEP=15
readonly OUTFILE="results.csv"

# Print a warning to stderr.
warn() {
  printf 'Warning: %s\n' "$*" >&2
}

# Collapse every run of whitespace on stdin into a single space and strip
# leading/trailing space. Used instead of `xargs`, which additionally
# interprets quotes and backslashes and fails on values such as "Cee'd".
squeeze() {
  tr -s '[:space:]' ' ' | sed -e 's/^ //' -e 's/ $//'
}

# Print the arguments as one CSV row. Every field is wrapped in double
# quotes, embedded double quotes are doubled, and empty fields become
# "N/A" (used later for filtering out invalid data).
csv_row() {
  local row="" field dq='"'
  for field in "$@"; do
    [[ -n "$field" ]] || field="N/A"
    row+="${row:+,}${dq}${field//$dq/$dq$dq}${dq}"
  done
  printf '%s\n' "$row"
}

# Print the scheme-less URL (//host/.../inzerat/...) of every listing
# link found in the index page HTML, one per line.
#   $1 - index page HTML with newlines already replaced
list_listing_urls() {
  local page_html=$1
  grep -oP '<a href="\/\/[^"]+\/inzerat\/[^"]+" class="fnd ti">' <<<"$page_html" \
    | grep -oP 'href="\/\/[^"]+' \
    | cut -d'"' -f2
}

# Print the text of the table cell that follows the cell labelled
# "<label>:" in the listing HTML, with nested tags removed.
#   $1 - listing HTML (single line, UTF-8)
#   $2 - label text without the trailing colon
extract_field() {
  local html=$1 label=$2
  grep -oP "(?<=<td class=\"ti\">${label}:</td><td>).*?(?=</td>)" <<<"$html" \
    | sed 's/<[^>]*>//g' \
    | squeeze
}

# Download one listing and print its CSV row on stdout.
#   $1 - absolute listing URL
scrape_listing() {
  local url=$1
  local html name year driven fuel engine_volume engine_power price

  # Downloaded html is encoded in windows-1250
  # -> gets re-encoded to utf-8
  # Then all newlines get removed
  if ! html=$(curl -sS "$url" | iconv -f WINDOWS-1250 -t UTF-8 | tr -d '\n'); then
    # Best effort: whatever was received is still parsed; missing fields
    # end up as "N/A" in the row.
    warn "download or re-encoding failed for ${url}"
  fi

  # The title span's class attribute may be written with double quotes or
  # with single quotes -> try the double-quoted form first, then fall back.
  name=$(grep -oP '<span class="title"><h1>\K[^<]+' <<<"$html" | squeeze)
  if [[ -z "$name" ]]; then
    name=$(grep -oP "<span class='title'><h1>\\K[^<]+" <<<"$html" | squeeze)
  fi

  year=$(extract_field "$html" 'Rok výroby')
  driven=$(extract_field "$html" 'Najeto')
  fuel=$(extract_field "$html" 'Palivo')
  engine_volume=$(extract_field "$html" 'Objem motoru')
  engine_power=$(extract_field "$html" 'Výkon motoru')
  price=$(grep -oP '(?<=<span class="cena2"> <b>).*?(?=<\/b>)' <<<"$html" \
    | sed 's/ //g' \
    | squeeze)

  csv_row "$url" "$name" "$year" "$driven" "$fuel" \
    "$engine_volume" "$engine_power" "$price"
}

# Entry point: validate the arguments, write the CSV header and walk the
# index pages.
main() {
  if [[ "$#" -ne 2 ]]; then
    echo "Usage: $0 START_PAGE END_PAGE" >&2
    exit 1
  fi

  # Both bounds are used in arithmetic context below; accept plain
  # non-negative integers only so nothing else gets evaluated there.
  if ! [[ "$1" =~ ^[0-9]+$ && "$2" =~ ^[0-9]+$ ]]; then
    echo "Error: START_PAGE and END_PAGE must be non-negative integers" >&2
    exit 1
  fi

  # 10# forces base 10 so values with leading zeros are not read as octal.
  local -r START_PAGE=$((10#$1))
  local -r END_PAGE=$((10#$2))
  local page page_url page_content relative_url full_url

  # The header lists the same eight columns, in the same order, that
  # scrape_listing emits.
  echo "URL,Name,Year,Driven,Fuel,EngineVolume,EnginePower,Price" > "$OUTFILE"

  for ((page = START_PAGE; page <= END_PAGE; page += PAGE_STEP)); do
    page_url="https://www.autobazar.cz/inzeraty/?first=${page}"
    echo "Processing page: ${page_url}"

    # Newlines become spaces so each link tag is matched on a single line.
    if ! page_content=$(curl -sS "$page_url" | tr '\n' ' '); then
      warn "download failed for ${page_url}"
    fi

    while IFS= read -r relative_url; do
      # Never build a request from an empty match.
      [[ -n "$relative_url" ]] || continue

      full_url="https:${relative_url}"
      echo ">> Processing listing: ${full_url}"
      scrape_listing "$full_url" >> "$OUTFILE"
    done < <(list_listing_urls "$page_content")
  done

  echo "Scraping completed. Results saved in ${OUTFILE}"
}

main "$@"