#!/bin/sh

# ===========================================================================
#
#                            PUBLIC DOMAIN NOTICE
#            National Center for Biotechnology Information (NCBI)
#
#  This software/database is a "United States Government Work" under the
#  terms of the United States Copyright Act.  It was written as part of
#  the author's official duties as a United States Government employee and
#  thus cannot be copyrighted.  This software/database is freely available
#  to the public for use. The National Library of Medicine and the U.S.
#  Government do not place any restriction on its use or reproduction.
#  We would, however, appreciate having the NCBI and the author cited in
#  any work or product based on this material.
#
#  Although all reasonable efforts have been taken to ensure the accuracy
#  and reliability of the software and data, the NLM and the U.S.
#  Government do not and cannot warrant the performance or results that
#  may be obtained by using this software or data. The NLM and the U.S.
#  Government disclaim all warranties, express or implied, including
#  warranties of performance, merchantability or fitness for any particular
#  purpose.
#
# ===========================================================================
#
# File Name:  nquire
#
# Author:  Jonathan Kans, Aaron Ucko
#
# Version Creation Date:   03/28/2020
#
# ==========================================================================

# pth must contain cacert.pem certificate (previously within aux/lib/perl5/Mozilla/CA/ subdirectory)

pth=$( dirname "$0" )

case ":$PATH:" in
  *:"$pth":* )
    ;;
  * )
    PATH="$PATH:$pth"
    export PATH
    ;;
esac

# ignore arguments that previously chose between this shell script redesign and original Perl implementation

while [ "$#" -ne 0 ]
do
  case "$1" in
    -newmode | -oldmode )
      shift
      ;;
    * )
      break
      ;;
  esac
done

# help and example texts

PrintHelp() {

  version=$( einfo -newmode -version )
  echo "nquire $version"
  cat << "EOF"

Query Commands

  -url    Sends query with HTTP POST
  -get    Uses HTTP GET instead of POST

  -lst    Lists contents of FTP site
  -ftp    Retrieves data from FTP site

File Downloads

  -dwn    Downloads FTP data to file
  -asp    Uses Aspera download, if configured

Examples

  nquire -url https://eutils.ncbi.nlm.nih.gov entrez/eutils einfo.fcgi |
  xtract -pattern DbList -sep "\n" -element DbName | sort -f

  nquire -url https://eutils.ncbi.nlm.nih.gov entrez/eutils elink.fcgi \
    -dbfrom pubmed -db pubmed -cmd neighbor -linkname pubmed_pubmed -id 2539356

  nquire -get https://icite.od.nih.gov/api/pubs -pmids 1937004 10838572 |
  transmute -j2x |
  xtract -pattern opt -element cited_by references |
  accn-at-a-time

  nquire -get "http://collections.mnh.si.edu/services/resolver/resolver.php" \
    -voucher "Birds:625456" |
  xtract -pattern Result -element ScientificName Country

  nquire -get http://w1.weather.gov/xml/current_obs/KSFO.xml |
  xtract -pattern current_observation -tab "\n" \
    -element weather temp_f wind_dir wind_mph

  nquire -get https://api.bigdatacloud.net/data/reverse-geocode-client \
    -latitude 41.7909 -longitude "\-87.5994" |
  transmute -j2x |
  xtract -pattern opt -element countryCode \
    -block administrative -if description -starts-with "state " -element name \
    -block administrative -if description -starts-with "city " -element name |
  tr '\t' '\n'

  nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 |
  grep acen | cut -f 1,2,6,7 | awk '/^X\t/'

  nquire -lst ftp://nlmpubs.nlm.nih.gov online/mesh/MESH_FILES/xmlmesh

  nquire -dwn ftp.nlm.nih.gov online/mesh/MESH_FILES/xmlmesh desc2021.zip

  for sect in baseline updatefiles
  do
    nquire -lst ftp.ncbi.nlm.nih.gov pubmed "$sect" |
    grep -v ".md5" | grep "xml.gz" |
    skip-if-file-exists | tee /dev/tty |
    nquire -asp ftp.ncbi.nlm.nih.gov pubmed "$sect"
  done

  nquire -raw -get http://golr-aux.geneontology.io/solr/select \
    -fq document_category:\"ontology_class\" -q *:* -fq id:\"GO:0030182\" \
    -wt json |
  transmute -j2x |
  xtract -pattern opt -element neighborhood_limited_graph_json topology_graph_json |
  transmute -j2x |
  xtract -pattern opt -num nodes edges

EOF
}

# nquire -get 'http://golr-aux.geneontology.io/solr/select?fq=document_category:"ontology_class"&q=*:*&fq=id:"GO:0030182"&wt=json'

PrintExamples() {

  version=$( einfo -newmode -version )
  echo "nquire $version"
  cat << "EOF"

Medical Subject Headings

  nquire -get "http://id.nlm.nih.gov/mesh/sparql" \
    -query "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> \
      SELECT DISTINCT ?class FROM <http://id.nlm.nih.gov/mesh> \
      WHERE { ?s rdf:type ?class } ORDER BY ?class" |
  xtract -pattern result -pfx "meshv:" -first "uri[http://id.nlm.nih.gov/mesh/vocab#|]"

MeSH Predicates

  nquire -get "http://id.nlm.nih.gov/mesh/sparql" \
    -query "SELECT DISTINCT ?p FROM <http://id.nlm.nih.gov/mesh> \
      WHERE { ?s ?p ?o } ORDER BY ?p" |
  xtract -pattern result -pfx "meshv:" -first "uri[http://id.nlm.nih.gov/mesh/vocab#|]"

WikiData Predicate List

  nquire -url "https://query.wikidata.org/sparql" \
    -query "SELECT ?property ?propertyType ?propertyLabel \
      ?propertyDescription ?propertyAltLabel WHERE { \
      ?property wikibase:propertyType ?propertyType . SERVICE wikibase:label \
      { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } } \
      ORDER BY ASC(xsd:integer(STRAFTER(STR(?property), 'P')))" |
  xtract -pattern result -first "uri[http://www.wikidata.org/entity/|]" -first literal

Vitamin Binding Site

  nquire -get "http://www.wikidata.org/entity/Q22679758" |
  transmute -j2x |
  xtract -pattern entities -group claims -block P527 -element "value/id"

Children of JS Bach

  nquire -url "https://query.wikidata.org/sparql" \
    -query "SELECT ?child ?childLabel WHERE \
      { ?child wdt:P22 wd:Q1339. SERVICE wikibase:label \
        { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } }" |
  xtract -pattern result -block binding -if "@name" -equals childLabel -element literal

Eye Color Frequency

  nquire -url "https://query.wikidata.org/sparql" \
    -query "SELECT ?eyeColorLabel WHERE \
      { ?human wdt:P31 wd:Q5. ?human wdt:P1340 ?eyeColor. SERVICE wikibase:label \
        { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } }" |
  xtract -pattern result -element literal |
  sort-uniq-count-rank

EOF
}

# check for help commands

if [ $# -gt 0 ]
then
  case "$1" in
    -version )
      version=$( einfo -newmode -version )
      echo "$version"
      exit 0
      ;;
    -h | -help | --help )
      PrintHelp
      exit 0
      ;;
    -examples )
      PrintExamples
      exit 0
      ;;
  esac
fi

# allow environment variable to set preference for curl or wget (undocumented)

helper=""

if [ -n "${NQUIRE_HELPER}" ]
then
  helper="${NQUIRE_HELPER}"
fi

# allow environment variable to set curl connection timeout (undocumented)

timeout=20

if [ -n "${NQUIRE_TIMEOUT}" ]
then
  timeout="${NQUIRE_TIMEOUT}"
fi

# check for leading flags

debug=false
log=false
raw=false

while [ $# -gt 0 ]
do
  case "$1" in
    -debug )
      debug=true
      shift
      ;;
    -log )
      log=true
      shift
      ;;
    -raw )
      raw=true
      shift
      ;;
    -curl )
      # override setting from environment variable (undocumented)
      helper="curl"
      shift
      ;;
    -wget )
      # override setting from environment variable (undocumented)
      helper="wget"
      shift
      ;;
    * )
      # allows while loop to check for multiple flags
      break
      ;;
  esac
done

# check for presence of curl or wget

case "$helper" in
  curl | CURL | CUrl | Curl | cUrl | cURL )
    binary=$( command -v curl )
    if [ ! -x "$binary" ]
    then
      echo "ERROR: required curl helper is not present" >&2
      exit 1
    fi
    ;;
  wget | WGET | WGet | Wget | wGet | wGET )
    binary=$( command -v wget )
    if [ ! -x "$binary" ]
    then
      echo "ERROR: required wget helper is not present" >&2
      exit 1
    fi
    ;;
  * )
    binary=$( command -v curl )
    if [ ! -x "$binary" ]
    then
      binary=$( command -v wget )
    fi
    if [ ! -x "$binary" ]
    then
      echo "ERROR: nquire requires either curl or wget" >&2
      exit 1
    fi
    ;;
esac

# subset of perl -MURI::Escape -ne 'chomp;print uri_escape($_),"\n"'

Escape() {

  echo "$1" |
  sed -e "s/%/%25/g" \
      -e "s/!/%21/g" \
      -e "s/#/%23/g" \
      -e "s/&/%26/g" \
      -e "s/'/%27/g" \
      -e "s/*/%2A/g" \
      -e "s/+/%2B/g" \
      -e "s/,/%2C/g" \
      -e "s|/|%2F|g" \
      -e "s/:/%3A/g" \
      -e "s/;/%3B/g" \
      -e "s/=/%3D/g" \
      -e "s/?/%3F/g" \
      -e "s/@/%40/g" \
      -e "s/|/%7C/g" \
      -e "s/ /%20/g" |
  sed -e 's/\$/%24/g' \
      -e 's/(/%28/g' \
      -e 's/)/%29/g' \
      -e 's/</%3C/g' \
      -e 's/>/%3E/g' \
      -e 's/\[/%5B/g' \
      -e 's/\]/%5D/g' \
      -e 's/\^/%5E/g' \
      -e 's/{/%7B/g' \
      -e 's/}/%7D/g'
}

# initialize variables

mode=""

url=""
sls=""

arg=""
amp=""
cmd=""
pfx=""

# optionally include nextra.sh script, if present, for internal NCBI maintenance functions (undocumented)

if [ -f "$pth"/nextra.sh ]
then
  # dot command is equivalent of "source"
  . "$pth"/nextra.sh
fi

# get extraction method

if [ $# -gt 0 ]
then
  case "$1" in
    -url | -get | -lst | -ftp | -dwn | -asp )
      mode="$1"
      shift
      ;;
    -* )
      exec >&2
      echo "$0: Unrecognized option $1" >&2
      exit 1
      ;;
    * )
      echo "$0: Missing command $1" >&2
      exit 1
      ;;
  esac
fi

# collect URL directory components

while [ $# -gt 0 ]
do
  case "$1" in
    -ncbi )
      # shortcut for NCBI base (undocumented)
      shift
      url="https://www.ncbi.nlm.nih.gov"
      sls="/"
      ;;
    -eutils )
      # shortcut for EUtils base (undocumented)
      shift
      url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
      sls="/"
      ;;
    -pubchem )
      # shortcut for PubChem REST base (undocumented)
      shift
      url="https://pubchem.ncbi.nlm.nih.gov/rest/pug"
      sls="/"
      ;;
    -* )
      # otherwise leading dash indicates end of path, switch to arguments
      break
      ;;
    * )
      dir="$1"
      # remove trailing slash directory delimiter
      dir=${dir%/}
      shift
      url="$url$sls$dir"
      sls="/"
      ;;
  esac
done

# collect argument tags paired with (escaped) values

while [ $# -gt 0 ]
do
  case "$1" in
    -* )
      cmd="$1"
      # remove leading dash from argument
      cmd=${cmd#-}
      # add argument and command
      arg="$arg$amp$cmd"
      # subsequent commands preceded by ampersand
      amp="&"
      # precede first value (if any) with equal sign
      pfx="="
      shift
      ;;
    * )
      val="$1"
      # remove initial backslash used to protect leading minus sign
      val=${val#\\}
      # URL encoding
      if [ "$raw" = true ]
      then
        val=$( echo "$val" | sed -e "s/&/%26/g" -e "s/=/%3D/g" )
      else
        val=$( Escape "$val" )
      fi
      arg="$arg$pfx$val"
      # concatenate run of values with commas
      pfx=","
      shift
      ;;
  esac
done

# debugging output to stderr

if [ "$debug" = true ]
then
  if [ -z "$arg" ]
  then
    echo "PTH $pth" >&2
    echo "URL $url" >&2
  elif [ "$mode" = "-url" ]
  then
    echo "curl -fsSL \"$url\" -d \"$arg\"" >&2
  elif [ "$mode" = "-get" ]
  then
    echo "curl -fsSL \"$url?$arg\"" >&2
  else
    echo "PTH $pth" >&2
    echo "URL $url" >&2
    echo "ARG $arg" >&2
  fi
  exit 0
fi

# pause if Entrez Utilities server to avoid exceeding request frequency limit

slow=false

case "${SLOW_EDIRECT}" in
  "" | [FfNn]* | 0 | [Oo][Ff][Ff] )
    ;;
  * )
    slow=true
    ;;
esac

hasperl=$( command -v perl )

case $url in
  *"dev.ncbi.nlm.nih.gov/entrez/eutils/"* | *"internal.ncbi.nlm.nih.gov/entrez/eutils/"* )
    if [ "$slow" = true ]
    then
      sleep 1
    elif [ -x "$hasperl" ]
    then
      perl -MTime::HiRes -e 'Time::HiRes::usleep(1000)'
    fi
    ;;
  *"/entrez/eutils/"* )
    if [ "$slow" = true ]
    then
      sleep 1
    elif [ -x "$hasperl" ]
    then
      case $arg in
        *"api_key="* )
          perl -MTime::HiRes -e 'Time::HiRes::usleep(110000)'
          ;;
        * )
          perl -MTime::HiRes -e 'Time::HiRes::usleep(350000)'
          ;;
      esac
    else
      sleep 1
    fi
    ;;
esac

# set up colors for error report

ColorSetup() {

  if [ -z "$TERM" ] || [ ! -t 2 ]
  then
    red_fg=""
    blue_fg=""
    orig_colors=""
  elif command -v tput >/dev/null
  then
    red_fg="$(tput setaf 1)"
    blue_fg="$(tput setaf 4)"
    orig_colors="$(tput op)"
  else
    # assume ANSI
    escape="$(printf '\033')"
    red_fg="$escape[31m"
    blue_fg="$escape[34m"
    orig_colors="$escape[39;49m"
  fi
}

# common function to execute curl or wget command

SendRequest() {

  ColorSetup
  when=$( date )

  case "$binary" in
    */curl )
      if [ "$log" = true ]
      then
        echo "${blue_fg}$@${orig_colors}" >&2
      fi

      temp=$(mktemp /tmp/NQUIRE_HEADER.XXXXXXXXX)

      if [ -f "$pth"/cacert.pem ]
      then
        curl --http1.0 --connect-timeout 20 -fsSL --capath "$pth"/cacert.pem -D "$temp" "$@"
      else
        curl --http1.0 --connect-timeout 20 -fsSL -D "$temp" "$@"
      fi

      # capture and check curl return value
      res=$?
      if [ "$res" -ne 0 ]
      then
        # report failure
        echo "${red_fg}ERROR: curl command failed ( $when ) with: $res${orig_colors}" >&2
        echo "${blue_fg}$@${orig_colors}" >&2
        # show return code in first line of header
        head -n 1 "$temp" >&2
      fi

      rm "$temp"
      ;;
    */wget )
      if [ "$log" = true ]
      then
        echo "${blue_fg}$@${orig_colors}" >&2
      fi

      temp=$(mktemp /tmp/NQUIRE_HEADER.XXXXXXXXX)

      full_output=""
      if [ -f "$pth"/cacert.pem ]
      then
        wget -qS -O - --ca-certificate="$pth"/cacert.pem "$@" 2> "$temp"
      else
        wget -qS -O - --no-check-certificate "$@" 2> "$temp"
      fi

      # capture and check wget return value
      res=$?
      if [ "$res" -ne 0 ]
      then
        # report failure
        echo "${red_fg}ERROR: wget command failed ( $when ) with: $res${orig_colors}" >&2
        echo "${blue_fg}$@${orig_colors}" >&2
        # show return code in first line of header
        head -n 1 "$temp" >&2
      fi

      rm "$temp"
      ;;
  esac
}

# can use Aspera if installed

APPPATH=""
KEYPATH=""
KEYNAME=asperaweb_id_dsa.openssh

HasAspera() {

  case "$( uname -s )" in
    Darwin )
      sysdir='/Applications/Aspera Connect.app/Contents/Resources'
      sysdir2=/bin
      userdir=$HOME$sysdir
      ;;
    CYGWIN_NT* )
      sysdir='/cygdrive/c/Program Files/Aspera/Aspera Connect/bin'
      sysdir2='/cygdrive/c/Program Files (x86)/Aspera/Aspera Connect/bin'
      userdir="$( cygpath -H )/$USER/AppData/Local/Programs/Aspera/Aspera Connect/bin"
      ;;
    * )
      sysdir=/opt/aspera/bin
      sysdir2=/bin
      userdir=$HOME/.aspera/connect/bin
      ;;
  esac
  for d in "$sysdir" "$sysdir2" "$userdir"
  do
    if "$d/ascp" --version 2>&1 | grep '^Aspera' >/dev/null
    then
      APPPATH=$d
      break
    fi
  done
  if [ -z "$APPPATH" ]  &&  ascp --version 2>&1 | grep '^Aspera' >/dev/null
  then
    APPPATH=$( type -path ascp )
    APPPATH=$( dirname "$APPPATH" )
  fi
  if [ -z "$APPPATH" ]
  then
    return 1
  fi

  for d in "$APPPATH" "$sysdir" "$sysdir2" "$userdir"
  do
    if [ -f "$d/../etc/$KEYNAME" ]
    then
      KEYPATH=$d/../etc
      break
    elif [ -f "$d/$KEYNAME" ]
    then
      KEYPATH=$d
      break
    fi
  done
  if [ -z "$KEYPATH" ]
  then
    return 1
  fi

  return 0
}

if [ "$mode" = "-asp" ]
then
  HasAspera
  if [ "$?" = 1 ]
  then
    # Aspera not found, revert to download with FTP
    mode="-dwn"
  else
    # add colon before first slash in URL for Aspera
    url=$(echo "$url" | sed -e 's|/|:/|')
  fi
fi

# common method for file download

failed=""

DownloadOneFile() {

  urlfl="$1"
  fl="$2"

  if [ ! -f "$fl" ]
  then
    # only download if local file does not already exist
    case "$mode" in
      -dwn )
        SendRequest "$urlfl" > "$fl"
        ;;
      -asp )
        "$APPPATH/ascp" -T -q -k 1 -l 500m -i "$KEYPATH/$KEYNAME" \
        "anonftp@$urlfl" "."
        ;;
    esac
    if [ ! -f "$fl" ]
    then
      # report failure to download requested file
      failed="$failed\n$fl"
    fi
  fi
}

# send request with method-specific arguments

case "$mode" in
  -url )
    case "$binary" in
      */curl )
        if [ -n "$arg" ]
        then
          SendRequest "$url" -d "$arg"
        else
          SendRequest "$url"
        fi
        ;;
      */wget )
        if [ -n "$arg" ]
        then
          SendRequest --post-data="$arg" "$url"
        else
          SendRequest --post-data="" "$url"
        fi
        ;;
    esac
    ;;
  -get )
    if [ -n "$arg" ]
    then
      SendRequest "$url?$arg"
    else
      SendRequest "$url"
    fi
    ;;
  -lst )
    case "$binary" in
      */curl )
        SendRequest "$url/" |
        tr -s ' ' | tr ' ' '\t' | cut -f 9 | grep '.'
        ;;
      */wget )
        SendRequest "$url" |
        sed -e 's/<[^>]*>//g' | tr ' ' '\t' | cut -f 1 | grep '.'
        ;;
    esac
    ;;
  -ftp )
    if [ -t 0 ]
    then
      SendRequest "$url"
    else
      # read file names from stdin, URL contains base address of server
      while IFS=$'\t' read fl
      do
        SendRequest "$url/$fl"
      done
    fi
    ;;
  -dwn | -asp )
    if [ -t 0 ]
    then
      # file name is included in URL constructed from command line arguments
      fl=$( basename "$url" )
      DownloadOneFile "$url" "$fl"
    else
      # read file names from stdin, URL contains base address of server
      while IFS=$'\t' read fl
      do
        DownloadOneFile "$url/$fl" "$fl"
      done
    fi
    if [ -n "$failed" ]
    then
      echo -e "\nFAILED TO DOWNLOAD:\n$failed\n" >&2
      exit 1
    fi
    ;;
  * )
    echo "$0: Unrecognized option $1" >&2
    exit 1
    ;;
esac
