|
| 1 | + |
| 2 | +# coding: utf-8 |
| 3 | + |
| 4 | +# # User Testing gitnet |
| 5 | +# |
| 6 | +# #### *June 2016, using version 0.0.8 of gitnet on testpypi* |
| 7 | +# |
| 8 | + |
| 9 | +# |
| 10 | +# ## *Introduction* |
| 11 | +# |
| 12 | + |
| 13 | +# To follow this exercise successfully, you need to have: |
| 14 | +# - Python 3 (Anaconda with Python 3.5 is the best bet) |
| 15 | +# - Git (you can update git by running in the terminal: pip install git --upgrade) |
| 16 | +# - The current version of gitnet is 0.0.8. |
| 17 | +# - NetworkX (you can install by running in the terminal: pip install networkx) |
| 18 | +# - Matplotlib (you can install by running in the terminal: pip install matplotlib) |
| 19 | +# - Pygraphviz (not necessarily required, only for the default layout, which happens to be the best one we could find) |
| 20 | +# |
| 21 | +# **Note:** Unfortunately, Pygraphviz can potentially be difficult to install on Windows. If pip is not able to find vcvarsall.bat, then avoid editing the environment variables and use this website: http://www.lfd.uci.edu/~gohlke/pythonlibs/ to download the binary for Python 3.4. Unfortunately, although Pygraphviz will install, there still may be errors with the graph output. |
| 22 | +# |
| 23 | +# Installing gitnet with pip will automatically install bash if you do not already have it installed. |
| 24 | +# To install gitnet, open a terminal window and type: |
| 25 | +# |
| 26 | +# `pip install -i https://testpypi.python.org/pypi gitnet` |
| 27 | + |
| 28 | +# For all sections of this exercise, you will need to use the following libraries: |
| 29 | + |
| 30 | +import os |
| 31 | +# import pygraph # Needed for defaults used by quickplot, if you can't install, use layout='spring'. |
| 32 | +import gitnet as gn |
| 33 | +import networkx as nx |
| 34 | +import matplotlib.pyplot as plt |
| 35 | + |
| 36 | + |
| 37 | +# ## *1. Write-Good Repo* |
| 38 | + |
| 39 | +# For this exercise, we are going to use the project: https://github.com/btford/write-good |
| 40 | +# |
| 41 | +# In a new terminal window, type: |
| 42 | +# |
| 43 | +# `git clone https://github.com/btford/write-good.git` |
| 44 | +# |
| 45 | +# OR open the page in a browser and download the zip folder. |
| 46 | + |
| 47 | +# Set the current working directory, so that all files created will be stored there. |
| 48 | +# The best bet is to create a folder named 'temp' on your desktop. |
| 49 | +os.chdir('path') |
| 50 | + |
| 51 | +# Insert the path to the write-good folder on your machine. |
| 52 | +mylogs = gn.get_log('path') |
| 53 | +# You can generate a network using any two tags that exist in the log. For a list of tags, just call .attributes() on your log object. |
| 54 | +graph = mylogs.generate_network('author', 'files') |
| 55 | +# Quickplot is a preset function that can be used to quickly visualize a network. |
| 56 | +graph.quickplot('write_good_net.pdf', layout = 'spring') |
| 57 | + |
| 58 | +# You can get a list of all of the values of any tag in the log object. |
| 59 | +# First, let's take a look at all of the possible tags. |
| 60 | +print(mylogs.attributes()) |
| 61 | +# Now, let's print that list of values. Choose one of the tags from above. |
| 62 | +print(mylogs.vector('date')) |
| 63 | + |
| 64 | + |
| 65 | +# ## *2. NetworkX* |
| 66 | + |
| 67 | +# For this exercise, we are going to use this project: https://github.com/networkx/networkx |
| 68 | +# |
| 69 | +# In a new terminal window, type: |
| 70 | +# |
| 71 | +# `git clone https://github.com/networkx/networkx.git` |
| 72 | +# |
| 73 | +# OR open the page in a browser and download the zip folder. |
| 74 | + |
| 75 | +# First, we are going to create another log object. |
| 76 | +networkx_log = gn.get_log('path') |
| 77 | + |
| 78 | +# Now you can export the log as a TSV file. |
| 79 | +networkx_log.tsv(fname = 'networkx_data.tsv') |
| 80 | + |
| 81 | + |
| 82 | +# Take a minute to open this file and look at the contents. |
| 83 | +# |
| 84 | +# Notice that there are similar author names that use the same email address. |
| 85 | +# |
| 86 | +# **Hint:** since version 0.0.8, we have simplified the process of identifying duplicate authors. |
| 87 | +# Use `author_email_list` along with `detect_dup_emails` to find potentially duplicate authors. See the cheat sheet for more details. |
| 88 | + |
| 89 | +# Gitnet cannot automatically predict when a single author uses two different names to commit to a repo. |
| 90 | +# For this reason, you may need to replace one of their aliases with the other. |
| 91 | +replaced_netx = networkx_log.replace_val('author', 'aric', 'Aric Hagburg') |
| 92 | +# To make sure that this worked, just create a new TSV and look at the contents. |
| 93 | +replaced_netx.tsv(fname = 'replaced_data.tsv') |
| 94 | + |
| 95 | +# You can also create an edgelist from any two tags. |
| 96 | +# Check the possible tags. |
| 97 | +print(replaced_netx.attributes()) |
| 98 | +# Then use whichever ones you want to generate an edgelist. |
| 99 | +replaced_netx.write_edges('edgelist.txt', 'author', 'files') |
| 100 | + |
| 101 | + |
| 102 | +# *Optional:* you can now read this file into R as an edgelist |
| 103 | + |
| 104 | +# ## *3. Tensorflow* |
| 105 | + |
| 106 | +# For this exercise, we are going to use this project: https://github.com/tensorflow/tensorflow |
| 107 | +# |
| 108 | +# In a new terminal window, type: |
| 109 | +# |
| 110 | +# `git clone https://github.com/tensorflow/tensorflow.git` |
| 111 | +# |
| 112 | +# OR open the page in a browser and download the zip folder. |
| 113 | + |
| 114 | +# Let's start by creating a log object and a graph object, just as in the first exercise. |
| 115 | +logs_tensor = gn.get_log('path') |
| 116 | +graph_tensor = logs_tensor.generate_network('author', 'files') |
| 117 | + |
| 118 | +# For now, hold off on plotting or exporting, and try out some of the advanced methods |
| 119 | +# |
| 120 | +# Below are some usage examples for filter and ignore |
| 121 | + |
| 122 | +# Filter seems to have an error in IPYNB format. |
| 123 | + |
| 124 | +# Filter records based on the email domain. |
| 125 | +filtered_email = logs_tensor.filter('email', 'has', '@gmail.com') |
| 126 | +# Filter records based on the author name. |
| 127 | +filtered_author = logs_tensor.filter('author', 'equals', 'Martin Wicke') |
| 128 | +# Filter records based on commits that have occurred after a certain date. |
| 129 | +filtered_date = logs_tensor.filter('date', 'since', 'Fri Jun 10 15:41:25 2016 -0400') |
| 130 | + |
| 131 | +# One of the limitations of filter is that because of the date-string format used by git, you need to type a pattern that at least partially matches the appearance of date-strings in the actual commits. |
| 132 | +# |
| 133 | +# However, it is still possible to use expressions such as `Fri Jun 10 *`, so there is still some room for flexible filtering. |
| 134 | + |
| 135 | +# Save one of these to a TSV file to check that it worked. |
| 136 | +filtered_author.tsv(fname = 'tensorflow_martin.tsv') |
| 137 | + |
| 138 | +# You can also ignore files and file edits that match any specified pattern. |
| 139 | +# Ignore python files: |
| 140 | +ignore_python = logs_tensor.ignore('.py') |
| 141 | +# Ignore files with the _ prefix: |
| 142 | +ignore_prefix = logs_tensor.ignore('_*') |
| 143 | + |
| 144 | + |
| 145 | +# Keep in mind that both `filter` and `ignore` can have a significant impact on the network graph. |
| 146 | +# |
| 147 | +# It is best to use them sparingly, and only when it is certainly useful to remove certain information. |
| 148 | +# In many cases, it makes more sense to simply export the full graph and all its data (as a graphml file, for example) and then prune the data in R. |
| 149 | + |
| 150 | +# Save one of these to a TSV file to check that it worked. |
| 151 | +ignore_python.tsv(fname = 'nopy_data.tsv') |
| 152 | + |
| 153 | +# Try generating a network using one of these modified log objects, and compare it to previous results. |
| 154 | +modified_graph = ignore_python.generate_network('author', 'files') |
| 155 | +modified_graph.quickplot('modified_graph.pdf', layout = 'spring') # this runs very slow. |
| 156 | + |
| 157 | + |
| 158 | +# One note about the quickplot function is that it typically uses the `neato` layout from Graphviz (via `pygraphviz`). |
| 159 | +# |
| 160 | +# Here we are using the `spring` layout from `NetworkX`, but if you did get pygraphviz installed, then you can simply leave |
| 161 | +# out the layout argument. It defaults to `neato`. |
| 162 | + |
| 163 | +# Try calling describe on both a log object and a graph object. |
| 164 | +# Is there any other information you would like to see in the describe output? |
| 165 | +ignore_python.describe() |
| 166 | +modified_graph.describe() |
| 167 | + |
| 168 | + |
| 169 | +# The last advanced method we have to show you is collapse graph. This quickly creates a one-mode network, using *mode1* of the |
| 170 | +# original graph object. |
| 171 | + |
| 172 | +# Try calling one of the advanced graph methods, such as *collapse_edges* |
| 173 | +basic_graph = logs_tensor.generate_network('author', 'files') |
| 174 | +# `sum_weights = True` is an optional argument that creates a weighted multigraph. |
| 175 | +collapsed_graph = basic_graph.collapse_edges(sum_weights = True) |
| 176 | +collapsed_graph.quickplot(fname = "ok_net.pdf") |
| 177 | + |
| 178 | + |
| 179 | +# Optional: try reading an output file into R. |
| 180 | +# |
| 181 | +# Use the edge list created earlier, or create a new *tnet file* or *graphml file* and try reading it into R. |
| 182 | + |
| 183 | +# The graphml file will be saved at the specified path, while the tnet file will be saved in the current directory. |
| 184 | +basic_graph.write_tnet('filename') |
| 185 | +basic_graph.write_graphml('path/to/file') |
| 186 | + |
| 187 | + |
| 188 | +# If you prefer, you can use two columns of the TSV file as the 'source' and 'target' of a networkx graph object in R. |
0 commit comments