Wie-geht-was? Tipps, Tricks und Skripte für DeepGreen
Sammelsurium kleinerer Skripte für DeepGreen / Elasticsearch
#! /usr/bin/env bash
#
# Export selected fields of all records in the "jper" Elasticsearch index.
#
# For every mapping whose name matches routed20NNNN or "failed", the script
# pages through the records (curl is executed on the Elasticsearch host via
# ssh) and writes
#   <dginst>_<YYYY-MM-DD>/<mapping>.json  one compact JSON object per line
#   <dginst>_<YYYY-MM-DD>/<mapping>.csv   same fields as CSV, sorted, no header
#
# Requires: ssh access to ${user}@${host}, curl on that host, jq locally.

set -euo pipefail

dginst="www.oa-deepgreen.de"
user="green"
host="sl64.kobv.de"
elhost="localhost:9200/jper"
elurl="${elhost}/_mappings"
stepsize=400

# Resolve the output directory exactly once: evaluating `date` again for every
# mapping would point to a non-existent directory if the run crosses midnight.
outdir="${dginst}_$(date +%F)"

# Fields kept per record. Records without an identifier of type "doi" produce
# no output line (unchanged behaviour); `[]?` only prevents a record with a
# missing identifier list from aborting the rest of its page.
record_filter='.hits.hits[]._source | { analysis:.analysis_date,
  pubid:.provider.id,
  publisher:.metadata.publisher,
  doi:.metadata.identifier[]? | select(.type=="doi").id,
  pubdate:.metadata.publication_date,
  issn:.issn_data,
  id:.id }'

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Fetch URL $1 with curl on the Elasticsearch host; response goes to stdout.
# `ssh -n` keeps ssh from consuming the stdin of the surrounding loop.
remote_get() {
  ssh -n "${user}@${host}" "curl -s '$1'"
}

maps=$(remote_get "${elurl}" \
  | jq '.jper.mappings | keys' \
  | grep -o "routed20[0-9]\{4\}\|failed") \
  || die "No routedYYYYMM/failed mappings found via ${user}@${host}."

mkdir -p "${outdir}"

while IFS= read -r mapping; do
  outf="${outdir}/${mapping}.json"

  total=$(remote_get "${elhost}/${mapping}/_count" | jq '.count') \
    || die "Could not query the record count of '${mapping}'."
  # jq prints "null" when the response carries no count; skip such mappings
  # instead of tripping over a non-numeric value below.
  if ! [[ "${total}" =~ ^[0-9]+$ ]]; then
    printf "Warning: unexpected record count '%s' for '%s', skipping.\n" \
      "${total}" "${mapping}" >&2
    continue
  fi
  printf "Found %d records in '%s'.\n" "${total}" "${mapping}"

  : >"${outf}"
  if (( total > 0 )); then
    # NOTE(review): from/size paging may hit Elasticsearch's result-window
    # limit for very large mappings -- confirm for the deployed version.
    for (( cursor = 0; cursor < total; cursor += stepsize )); do
      remote_get "${elhost}/${mapping}/_search?from=${cursor}&size=${stepsize}" \
        | jq -c "${record_filter}" >>"${outf}" \
        || printf "Warning: page at offset %d of '%s' is incomplete.\n" \
          "${cursor}" "${mapping}" >&2
    done

    # Turn each JSON object into a JSON array, strip the enclosing brackets
    # with sed to obtain a CSV row, and sort the rows.
    jq -c '[.analysis,.pubid,.publisher,.doi,.pubdate,.issn,.id]' "${outf}" \
      | sed 's/^.\(.*\).$/\1/' \
      | sort -t',' -k 1 >"${outf%.json}.csv"
  fi
done <<<"${maps}"
# resulting CSV header line (deliberately not included in the .csv files, to make concatenation easier):
# Analysis Date, Publisher-ID, Publisher Name, DOI, Publication Date, ISSNs, Elastic-ID (DeepGreen)
#
# example result:
# .../some/path$ ls www.oa-deepgreen.de_2019-06-19/*.csv
# www.oa-deepgreen.de_2019-06-19/failed.csv www.oa-deepgreen.de_2019-06-19/routed201806.csv
# www.oa-deepgreen.de_2019-06-19/routed201610.csv www.oa-deepgreen.de_2019-06-19/routed201807.csv
# www.oa-deepgreen.de_2019-06-19/routed201612.csv www.oa-deepgreen.de_2019-06-19/routed201808.csv
# www.oa-deepgreen.de_2019-06-19/routed201701.csv www.oa-deepgreen.de_2019-06-19/routed201809.csv
# www.oa-deepgreen.de_2019-06-19/routed201702.csv www.oa-deepgreen.de_2019-06-19/routed201810.csv
# www.oa-deepgreen.de_2019-06-19/routed201705.csv www.oa-deepgreen.de_2019-06-19/routed201811.csv
# www.oa-deepgreen.de_2019-06-19/routed201706.csv www.oa-deepgreen.de_2019-06-19/routed201812.csv
# www.oa-deepgreen.de_2019-06-19/routed201709.csv www.oa-deepgreen.de_2019-06-19/routed201901.csv
# www.oa-deepgreen.de_2019-06-19/routed201711.csv www.oa-deepgreen.de_2019-06-19/routed201902.csv
# www.oa-deepgreen.de_2019-06-19/routed201712.csv www.oa-deepgreen.de_2019-06-19/routed201903.csv
# www.oa-deepgreen.de_2019-06-19/routed201801.csv www.oa-deepgreen.de_2019-06-19/routed201904.csv
# www.oa-deepgreen.de_2019-06-19/routed201802.csv www.oa-deepgreen.de_2019-06-19/routed201905.csv
# www.oa-deepgreen.de_2019-06-19/routed201803.csv www.oa-deepgreen.de_2019-06-19/routed201906.csv
Der Inhalt der CSV-Dateien ist ebenfalls chronologisch angeordnet. Anhand der letzten CSV-Spalte
(Elastic-ID) können alle Datensätze eines Verlages aus der ES-Datenbank gelöscht werden.
#! /usr/bin/env bash
#
# List the distinct user ids found in jper/contentlog together with the
# e-mail address stored in the matching jper/account document.
#
# Output: one line per user id, "<user id><TAB><email as JSON value>".
# Only the hits returned by a single search with size=1000 are inspected.
#
# Requires: ssh access to the Elasticsearch host, curl on that host, jq locally.

set -euo pipefail

remote="green@sl64.kobv.de"
elhost="localhost:9200/jper"

# `// empty` drops hits that carry no user field; otherwise jq -r would print
# the literal string "null" and an account named "null" would be looked up.
ssh -n "${remote}" "curl -s '${elhost}/contentlog/_search?pretty&size=1000'" \
  | jq -rc '.hits.hits[]._source.user // empty' \
  | sort -u \
  | while IFS= read -r uid; do
      printf "%s\t" "${uid}"
      # -n is essential here: without it ssh would swallow the remaining
      # user ids from the loop's stdin.
      ssh -n "${remote}" "curl -s '${elhost}/account/${uid}'" \
        | jq '._source.email'
    done
# example output (2019-06-20):
# 03fad0a0ae5e4918964485663585adbe "UBEN@deepgreen.org"
# 272dd77deffc425b961f3ec4fb8a3993 "GFZPO@deepgreen.org"
# c0675e0b01864161b77f4654f6910577 "aUBEN@deepgreen.org"
#! /usr/bin/env bash
#
# For every line of the API key file that contains "id=", extract the
# 32-character hex id (rid) and the upper-case short id (bibid) and count the
# records in the routed201901 ... routed201906 mappings that match the hex id
# (Elasticsearch query-string search, `_count?q=<rid>`).
#
# Output: "<bibid> (<rid>) <count>" per line.
#
# Requires: ssh access to the Elasticsearch host, curl on that host, jq locally.

set -euo pipefail

keyfile="Work/Data/Util/api_keys_deepgreen_prototyp.txt"
remote="green@sl64.kobv.de"
elhost="localhost:9200/jper"
mappings="routed201901,routed201902,routed201903,routed201904,routed201905,routed201906"

if [[ ! -r "${keyfile}" ]]; then
  printf "Cannot read key file '%s'.\n" "${keyfile}" >&2
  exit 1
fi

# The `|| [[ -n ... ]]` part also processes a last line without newline.
while IFS= read -r line || [[ -n "${line}" ]]; do
  [[ "${line}" == *"id="* ]] || continue

  # First 32-digit hex string on the line; lines without one are skipped so
  # that no query with an empty search term is sent.
  rid=$(grep -o '[0-9a-f]\{32\}' <<<"${line}" || true)
  rid=${rid%%$'\n'*}
  [[ -n "${rid}" ]] || continue

  # Upper-case letters of the first `id=<char><3-5 capitals><char>` match.
  bibid=$(grep -o 'id=.[A-Z]\{3,5\}.' <<<"${line}" | grep -o '[A-Z]\+' || true)
  bibid=${bibid%%$'\n'*}

  # rid consists of hex digits only, so it is safe inside the remote command.
  url="${elhost}/${mappings}/_count?q=${rid}"
  # -n keeps ssh from reading the key file that feeds this loop.
  count=$(ssh -n "${remote}" "curl -s '${url}'" | jq '.count')
  printf "%5s (%s) %s\n" "${bibid}" "${rid}" "${count}"
done <"${keyfile}"
# example output (2019-06-20):
# BISOL (****************837bc06e522a31c5) 11
# FHH (****************ad1cadda4b834756) 4
# FHKN (****************8978652dca93083c) 0
# FUB (****************87af5b02004c2473) 39
# HSG (****************a7b664dacf4c9de7) 1
# SLUB (****************a02fecadff2533b6) 38
# SUBGO (****************85a3b1220493e1ca) 26
# THULB (****************a3b6da7f50c3a465) 18
# TUBB (****************b1154158cf6220cb) 37
# TUCB (****************bff2b95945caa775) 11
# TUDA (****************b243e619330f6810) 9
# TUM (****************bcfc89eedbf36164) 25
# UBA (****************8c08802501bcff46) 4
# UBB (****************9b48886bdef45515) 0
# UBEN (****************964485663585adbe) 100
# UBGRW (****************b366a444b5b9bd20) 7
# UBHE (****************85d320f7e6bb825e) 18
# UBHUB (****************940560f983d44ba7) 51
# UBKO (****************bc7b184a88dd8cf5) 5
# UBPA (****************911051629d3a7284) 1
# UBR (****************9c20df52e8349f5b) 11
# UBS (****************b212c216363bea48) 15
# UBTUE (****************ad6595efb536e7cf) 20
# UBWH (****************977c2ec49c527be9) 2
# UGHE (****************b2936250ef382eb0) 16