Added scrape script
This commit is contained in:
parent
85815a81b5
commit
c753acb6f5
64
scrape.sh
Executable file
64
scrape.sh
Executable file
@ -0,0 +1,64 @@
|
||||
#!/bin/bash
#
# Scrapes car listings from www.autobazar.cz into a CSV file.
#
# Usage: scrape.sh START_PAGE END_PAGE
#   START_PAGE  first value sent as the `first=` query offset
#   END_PAGE    last offset (inclusive) that may be requested
#
# Both arguments must be non-negative decimal integers.

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 START_PAGE END_PAGE" >&2
  exit 1
fi

# The values are later used inside a (( )) arithmetic loop, where any
# non-numeric text would be evaluated as an arithmetic expression instead of
# being rejected. Refuse everything that is not a plain run of digits.
for arg in "$@"; do
  if ! [[ "$arg" =~ ^[0-9]+$ ]]; then
    echo "Error: START_PAGE and END_PAGE must be non-negative integers (got '${arg}')" >&2
    echo "Usage: $0 START_PAGE END_PAGE" >&2
    exit 1
  fi
done

# Force base 10 so that input with leading zeros (e.g. "08") is not parsed
# as an (invalid) octal constant by the shell.
START_PAGE=$((10#$1))
END_PAGE=$((10#$2))
|
||||
|
||||
# Amount added to the `first=` query offset between two consecutive page
# requests. NOTE(review): presumably the number of listings the site shows
# per page - confirm against the live site.
PAGE_STEP=15

# Output CSV file; it is truncated and re-created on every run.
OUTFILE="results.csv"

# Header row. Every data row written by the scraping loop has eight fields
# (URL, car name, year, mileage, fuel, engine volume, engine power, price),
# so the header must name all eight; the "Name" column used to be missing,
# which shifted every following column title by one.
printf '%s\n' "URL,Name,Year,Driven,Fuel,EngineVolume,EnginePower,Price" > "$OUTFILE"
|
||||
|
||||
# Reads text on stdin and prints it with tabs/CRs/newlines turned into
# spaces, runs of spaces squeezed to one, and leading/trailing space removed.
# This replaces the former `xargs` trimming trick: xargs gives quotes and
# backslashes a special meaning and fails on an unmatched ' or ", which
# silently turned such values (e.g. a name with an apostrophe) into "N/A".
squeeze_ws() {
  tr '\t\r\n' '   ' | tr -s ' ' | sed -e 's/^ //' -e 's/ $//'
}

# Prints the text of the table cell that follows the label cell
# <td class="ti">LABEL</td>, with nested tags stripped.
# Arguments: $1 - listing HTML on a single line
#            $2 - label text; inserted into the regex verbatim, so pass
#                 literal text only
table_cell() {
  local html=$1 label=$2
  grep -oP -- "(?<=<td class=\"ti\">${label}</td><td>).*?(?=</td>)" <<< "$html" \
    | sed 's/<[^>]*>//g' \
    | squeeze_ws
}

# Prints its arguments as one CSV record. Every field is wrapped in double
# quotes, embedded double quotes are doubled, and an empty field is written
# as "N/A" so that incomplete records are easy to filter out later.
csv_row() {
  local dq='"' field record=""
  for field in "$@"; do
    [[ -n "$field" ]] || field="N/A"
    record+="${record:+,}${dq}${field//$dq/$dq$dq}${dq}"
  done
  printf '%s\n' "$record"
}

for ((page = START_PAGE; page <= END_PAGE; page += PAGE_STEP)); do
  PAGE_URL="https://www.autobazar.cz/inzeraty/?first=${page}"
  echo "Processing page: ${PAGE_URL}"

  # Flatten the page to one line so a link split across lines still matches.
  page_content=$(curl -s "$PAGE_URL" | tr '\n' ' ')

  # Collect the protocol-relative URL (//host/.../inzerat/...) of every
  # anchor carrying class="fnd ti". Reading into an array means a page
  # without matches yields zero iterations, instead of one iteration over
  # an empty line that would fetch "https:" and append a bogus row.
  mapfile -t listing_urls < <(
    grep -oP '<a href="\K//[^"]+/inzerat/[^"]+(?=" class="fnd ti">)' <<< "$page_content"
  )

  if (( ${#listing_urls[@]} == 0 )); then
    echo "No listings found on ${PAGE_URL}" >&2
    continue
  fi

  for relative_url in "${listing_urls[@]}"; do
    FULL_URL="https:${relative_url}"
    echo ">> Processing listing: ${FULL_URL}"

    # The listing page is served as windows-1250: convert it to UTF-8,
    # then drop all newlines so each field can be matched on one line.
    ad_content=$(curl -s "$FULL_URL" | iconv -f WINDOWS-1250 -t UTF-8 | tr -d '\n')

    # The title span is written with double quotes on some pages and with
    # single quotes on others; try the double-quoted form first.
    car_name=$(grep -oP '<span class="title"><h1>\K[^<]+' <<< "$ad_content" | squeeze_ws)
    if [[ -z "$car_name" ]]; then
      car_name=$(grep -oP '<span class='\''title'\''><h1>\K[^<]+' <<< "$ad_content" | squeeze_ws)
    fi

    year=$(table_cell "$ad_content" 'Rok výroby:')
    driven=$(table_cell "$ad_content" 'Najeto:')
    fuel=$(table_cell "$ad_content" 'Palivo:')
    engine_volume=$(table_cell "$ad_content" 'Objem motoru:')
    engine_power=$(table_cell "$ad_content" 'Výkon motoru:')

    # The price has all spaces removed (not just trimmed).
    price=$(grep -oP '(?<=<span class="cena2"> <b>).*?(?=<\/b>)' <<< "$ad_content" \
      | sed 's/ //g' \
      | squeeze_ws)

    csv_row "$FULL_URL" "$car_name" "$year" "$driven" "$fuel" \
      "$engine_volume" "$engine_power" "$price" >> "$OUTFILE"
  done
done

echo "Scraping completed. Results saved in ${OUTFILE}"
|
||||
|
Loading…
x
Reference in New Issue
Block a user